Example #1
0
    def _initHiddenStates(self):
        """Initialise the latent-state matrix of every MCMC chain.

        One ``NamedMatrix`` per chain is appended to ``self.nodeStates``.  Its
        columns are the unmeasured nodes (each entry set to 1 with probability
        0.3) followed by the perturbation columns when perturbation data was
        loaded.  Every phosphorylation-state column is then overwritten with
        the more likely of the two Gaussian components stored on the matching
        fluorescence node (``mus`` / ``sigmas``), and entries flagged in the
        missing-data matrix are re-drawn at random.
        """

        def gaussLogDensity(values, mu, sigma):
            # Gaussian log-density without the -0.5*log(2*pi) term, which is
            # identical for both components and cancels in the comparison.
            return -np.log(sigma) - 0.5 * np.square(values - mu) / np.square(sigma)

        hiddenNodes = []
        phosNodes = []
        for n in self.network:
            nodeObj = self.network.node[n]['nodeObj']
            if not nodeObj.bMeasured:
                hiddenNodes.append(n)
            if nodeObj.type == 'PHOSPHORYLATIONSTATE':
                phosNodes.append(n)

        nCases = self.obsData.shape()[0]
        caseNames = self.obsData.getRownames()

        # perturbData only exists when a perturbation matrix was supplied.
        perturbData = getattr(self, 'perturbData', None)

        missing = self.missingDataMatrix
        missingCols = missing.getColnames() if missing is not None else ()

        self.nodeStates = list()
        for c in range(self.nChains):
            states = np.zeros((nCases, len(hiddenNodes)))
            states[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            colnames = list(hiddenNodes)
            if perturbData is not None:
                states = np.column_stack((states, perturbData.data))
                colnames = colnames + list(perturbData.colnames)
            chainStates = NamedMatrix(npMatrix=states,
                                      colnames=colnames,
                                      rownames=caseNames)
            self.nodeStates.append(chainStates)

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                # NOTE(review): component index 1 is treated as the
                # "phosphorylated" component; confirm the mixture fit orders
                # its components that way.
                logOne = gaussLogDensity(fluoData, fluoNodeObj.mus[c, 1],
                                         fluoNodeObj.sigmas[c, 1])
                logZero = gaussLogDensity(fluoData, fluoNodeObj.mus[c, 0],
                                          fluoNodeObj.sigmas[c, 0])
                phosState = np.zeros(nCases)
                phosState[logOne > logZero] = 1
                nodeIndx = chainStates.findColIndices(node)
                chainStates.data[:, nodeIndx] = phosState

                # Take care of missing values by random sampling.  The
                # constructor appends 'F' to the missing-matrix column names,
                # so the lookup must use the fluorescence name, not the
                # phospho node name.
                if fluoNode in missingCols:
                    missingCases = missing.getValuesByCol(fluoNode) == 1
                    redraw = np.zeros(int(np.sum(missingCases)))
                    redraw[np.random.rand(len(redraw)) <= 0.3] = 1
                    chainStates.data[missingCases, nodeIndx] = redraw
Example #2
0
    def __init__(self,
                 nodeFile,
                 dataMatrixFile,
                 perturbMatrix=None,
                 missingDataMatrix=None):
        """Load the data matrices and build the layered signalling network.

        Args:
            nodeFile: path of a text file with one ``protein,antibody`` pair
                per line; blank lines are ignored.
            dataMatrixFile: passed to ``NamedMatrix`` to load the observed
                data; its column names get an 'F' suffix so that they match
                the fluorescence node names.
            perturbMatrix: optional, passed to ``NamedMatrix``; every column
                becomes a PERTURBATION node that points at every protein.
            missingDataMatrix: optional, passed to ``NamedMatrix``; its column
                names also get the 'F' suffix.

        Raises:
            Exception: if ``dataMatrixFile`` or ``nodeFile`` is empty, if the
                node file cannot be opened, or if a column of the missing-data
                matrix sums to the number of cases.
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        self.perturbData = None
        # Always a list, so the loops below also work without perturbMatrix.
        self.perturbInstances = []
        self.nChains = 1

        # protein -> list of (perturbation condition, state to clamp to)
        self.dictPerturbEffect = {
            'AKT1': [('GSK690693', 0), ('GSK690693_GSK1120212', 0)],
            'MAP2K1': [('GSK690693_GSK1120212', 0)],
            'EGFR': [('EGF', 1), ('FGF1', 1)],
        }

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception(
                "Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases = np.shape(self.obsData.data)[0]
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            self.perturbInstances = self.perturbData.getColnames()
        perturbInstances = self.perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # Sum the underlying array, not the NamedMatrix wrapper.
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [
                s + 'F' for s in self.missingDataMatrix.colnames
            ]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception("Failed to open the file containing nodes")
        if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # Blank lines, e.g. the empty tail left by splitting on a
                # trailing "\r", carry no node.
                continue
            protein, antibody = line.split(',')

            self.dictProteinToAntibody.setdefault(protein, []).append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein,
                                      nodeObj=SigNetNode(
                                          protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody,
                                  nodeObj=SigNetNode(antibody,
                                                     'PHOSPHORYLATIONSTATE',
                                                     False))
            self.network.add_node(fluo,
                                  nodeObj=SigNetNode(fluo, 'FLUORESCENCE',
                                                     True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb,
                                  nodeObj=SigNetNode(perturb, 'PERTURBATION',
                                                     True))

        # Add edges between the PERTURBATION, protein activity and
        # phosphorylation layers: every protein points at every phospho site
        # that is not its own, and every perturbation points at every protein.
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)
Example #3
0
class PyGibbCAMP:
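    ## Constructor
    #  @param nodeFile  Path of the file listing the nodes, one
    #                   "protein,antibody" pair per line.  There is no
    #                   edgeFile parameter: the edges are derived from these
    #                   pairs inside the constructor.
    #  @param dataMatrixFile  File name of the observed data matrix
    #  @param perturbMatrix  Optional perturbation matrix, loaded through
    #                        NamedMatrix
    #  @param missingDataMatrix  Optional missing-data indicator matrix,
    #                            loaded through NamedMatrix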
    def __init__(self,
                 nodeFile,
                 dataMatrixFile,
                 perturbMatrix=None,
                 missingDataMatrix=None):
        """Load the data matrices and build the layered signalling network.

        Args:
            nodeFile: path of a text file with one ``protein,antibody`` pair
                per line; blank lines are ignored.
            dataMatrixFile: passed to ``NamedMatrix`` to load the observed
                data; its column names get an 'F' suffix so that they match
                the fluorescence node names.
            perturbMatrix: optional, passed to ``NamedMatrix``; every column
                becomes a PERTURBATION node that points at every protein.
            missingDataMatrix: optional, passed to ``NamedMatrix``; its column
                names also get the 'F' suffix.

        Raises:
            Exception: if ``dataMatrixFile`` or ``nodeFile`` is empty, if the
                node file cannot be opened, or if a column of the missing-data
                matrix sums to the number of cases.
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        self.perturbData = None
        # Always a list, so the loops below also work without perturbMatrix.
        self.perturbInstances = []
        self.nChains = 1

        # protein -> list of (perturbation condition, state to clamp to)
        self.dictPerturbEffect = {
            'AKT1': [('GSK690693', 0), ('GSK690693_GSK1120212', 0)],
            'MAP2K1': [('GSK690693_GSK1120212', 0)],
            'EGFR': [('EGF', 1), ('FGF1', 1)],
        }

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception(
                "Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases = np.shape(self.obsData.data)[0]
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            self.perturbInstances = self.perturbData.getColnames()
        perturbInstances = self.perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # Sum the underlying array, not the NamedMatrix wrapper.
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [
                s + 'F' for s in self.missingDataMatrix.colnames
            ]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception("Failed to open the file containing nodes")
        if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # Blank lines, e.g. the empty tail left by splitting on a
                # trailing "\r", carry no node.
                continue
            protein, antibody = line.split(',')

            self.dictProteinToAntibody.setdefault(protein, []).append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein,
                                      nodeObj=SigNetNode(
                                          protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody,
                                  nodeObj=SigNetNode(antibody,
                                                     'PHOSPHORYLATIONSTATE',
                                                     False))
            self.network.add_node(fluo,
                                  nodeObj=SigNetNode(fluo, 'FLUORESCENCE',
                                                     True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb,
                                  nodeObj=SigNetNode(perturb, 'PERTURBATION',
                                                     True))

        # Add edges between the PERTURBATION, protein activity and
        # phosphorylation layers: every protein points at every phospho site
        # that is not its own, and every perturbation points at every protein.
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)

    ## Init parameters of the model
    #  In a Bayesian network setting, the joint probability is calculated
    #  as the product of a series of conditional probabilities.  The
    #  parameters of the PyCAMP model define p(x | Pa(x)).  For an observed
    #  fluorescence node the conditional probability is a mixture of two
    #  Gaussian distributions; therefore, the parameters are two pairs of mu
    #  and sigma.  For the hidden variables representing phosphorylation
    #  states and activation states of proteins, the conditional probability
    #  is defined by a logistic regression.  Therefore, the parameters
    #  associated with such a node are a vector of real numbers.
    #
    def _initParams(self):
        print "Initialize parameters associated with each node in each MCMC chain"
        for nodeId in self.network:
            self._initNodeParams(nodeId)

    def _initNodeParams(self, nodeId):
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':
            # Estimate mean and sd of fluo signal using mixture model
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames(
            ):
                nodeData = self.obsData.getValuesByCol(nodeId)
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(
                    nodeId) == 0]
            else:
                nodeData = self.obsData.getValuesByCol(nodeId)
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k=2)
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c, :] = np.array(mixGaussians[2])
                nodeObj.sigmas[c, :] = np.array(mixGaussians[3])
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params = None

    ## Initialize latent variables
    #
    #
    def _initHiddenStates(self):
        """Initialise the latent-state matrix of every MCMC chain.

        One ``NamedMatrix`` per chain is appended to ``self.nodeStates``.  Its
        columns are the unmeasured nodes (each entry set to 1 with probability
        0.3) followed by the perturbation columns when perturbation data was
        loaded.  Every phosphorylation-state column is then overwritten with
        the more likely of the two Gaussian components stored on the matching
        fluorescence node (``mus`` / ``sigmas``), and entries flagged in the
        missing-data matrix are re-drawn at random.
        """

        def gaussLogDensity(values, mu, sigma):
            # Gaussian log-density without the -0.5*log(2*pi) term, which is
            # identical for both components and cancels in the comparison.
            return -np.log(sigma) - 0.5 * np.square(values - mu) / np.square(sigma)

        hiddenNodes = []
        phosNodes = []
        for n in self.network:
            nodeObj = self.network.node[n]['nodeObj']
            if not nodeObj.bMeasured:
                hiddenNodes.append(n)
            if nodeObj.type == 'PHOSPHORYLATIONSTATE':
                phosNodes.append(n)

        nCases = self.obsData.shape()[0]
        caseNames = self.obsData.getRownames()

        # perturbData only exists when a perturbation matrix was supplied.
        perturbData = getattr(self, 'perturbData', None)

        missing = self.missingDataMatrix
        missingCols = missing.getColnames() if missing is not None else ()

        self.nodeStates = list()
        for c in range(self.nChains):
            states = np.zeros((nCases, len(hiddenNodes)))
            states[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            colnames = list(hiddenNodes)
            if perturbData is not None:
                states = np.column_stack((states, perturbData.data))
                colnames = colnames + list(perturbData.colnames)
            chainStates = NamedMatrix(npMatrix=states,
                                      colnames=colnames,
                                      rownames=caseNames)
            self.nodeStates.append(chainStates)

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                # NOTE(review): component index 1 is treated as the
                # "phosphorylated" component; confirm the mixture fit orders
                # its components that way.
                logOne = gaussLogDensity(fluoData, fluoNodeObj.mus[c, 1],
                                         fluoNodeObj.sigmas[c, 1])
                logZero = gaussLogDensity(fluoData, fluoNodeObj.mus[c, 0],
                                          fluoNodeObj.sigmas[c, 0])
                phosState = np.zeros(nCases)
                phosState[logOne > logZero] = 1
                nodeIndx = chainStates.findColIndices(node)
                chainStates.data[:, nodeIndx] = phosState

                # Take care of missing values by random sampling.  The
                # constructor appends 'F' to the missing-matrix column names,
                # so the lookup must use the fluorescence name, not the
                # phospho node name.
                if fluoNode in missingCols:
                    missingCases = missing.getValuesByCol(fluoNode) == 1
                    redraw = np.zeros(int(np.sum(missingCases)))
                    redraw[np.random.rand(len(redraw)) <= 0.3] = 1
                    chainStates.data[missingCases, nodeIndx] = redraw

    ## Calculate the marginal probability of observing the measured data by
    #  integrating out all possible setting of latent variable states and
    #  model parameters.
    def calcEvidenceLikelihood(self):
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'
        ]
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data)
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack(
                    (np.ones(nCases),
                     self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (
                    1 + np.exp(-np.dot(predStates, nodeObj.params[c, :])))
                pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(
                    np.float).eps

                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
                + (1 - nodeData) * np.log(1 - pOneCondOnParents))

            loglikelihood /= self.nChains
            return loglikelihood

    ## Perform graph search
    def trainGibbsEM(self,
                     nChains=10,
                     alpha=0.1,
                     nParents=4,
                     nSamples=5,
                     pickleDumpFile=None,
                     maxIter=1000):
        """Train the model with a Gibbs-sampling EM loop.

        Each iteration re-samples the activation states (E-step).  Every
        second iteration the current states are added to a running sum; once
        ``nSamples`` samples are collected the sums are averaged, the node
        parameters are refitted (M-step), the likelihood is recorded, the
        object is pickled if the likelihood is the best seen so far, and
        convergence is checked.

        Args:
            nChains: number of chains to run.
            alpha: stored on ``self.alpha`` and forwarded to ``_updteParams``.
            nParents: stored on ``self.nParents`` and forwarded to
                ``_updteParams`` as ``nparents``.
            nSamples: number of collected samples per M-step.
            pickleDumpFile: where to pickle the best model; defaults to
                ``<obsDataFileName>alpha<alpha>.pickle``.
            maxIter: maximum number of iterations.

        Returns:
            ``self``.

        Raises:
            Exception: if the pickle dump file cannot be written.
        """
        self.nChains = nChains
        self.alpha = alpha
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents

        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(
                self.alpha) + ".pickle"

        # Check that the network and the data agree: drop every fluorescence
        # node without a data column, together with its first predecessor.
        obsColnames = self.obsData.getColnames()
        nodeToDelete = list()
        for nodeId in self.network:
            if (self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE'
                    and nodeId not in obsColnames):
                print("Node " + nodeId + " don't has associated data")
                nodeToDelete.append(nodeId)
                nodeToDelete.append(
                    list(self.network.predecessors(nodeId))[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print("removing node " + nodeId)
                self.network.remove_node(nodeId)

        # Starting EM: set up the Markov chains.
        self._initParams()
        self._initHiddenStates()

        # each chain collects expected statistics of nodes from samples
        # along the chain
        self.expectedStates = [
            np.zeros(np.shape(self.nodeStates[c].data))
            for c in range(self.nChains)
        ]

        print("Starting EM: alpha = " + str(self.alpha) + "; nChains = " +
              str(self.nChains) + "; nSamples = " + str(self.nSamples) +
              "; nParents = " + str(self.nParents))
        optLikelihood = float("-inf")
        sampleCount = 0

        likelihood = self.calcEvidenceLikelihood()
        print("nIter: 0" + "; log likelihood of evidence: " + str(likelihood))
        self.likelihood.append(likelihood)
        for nIter in range(maxIter):

            # E-step of EM
            self._updateActivationStates()
            if (nIter + 1) % 2 == 0:  # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] += self.nodeStates[c].data

            # M-step of EM.  Parameters are only updated after a certain
            # number of samples has been collected.
            if sampleCount < self.nSamples:
                continue
            sampleCount = 0

            # Take the expectation of the sampled states.  A real list is
            # needed because the entries are indexed by chain afterwards.
            self.expectedStates = [
                summed / self.nSamples for summed in self.expectedStates
            ]
            self._updteParams(self.alpha, nparents=self.nParents)

            likelihood = self.calcEvidenceLikelihood()
            self.likelihood.append(likelihood)
            print("nIter: " + str(nIter + 1) +
                  "; log likelihood of evidence: " + str(likelihood))

            # collect the current best fit model
            if likelihood > optLikelihood:
                optLikelihood = likelihood
                try:
                    # the context manager closes the file even if dump fails
                    with open(self.pickleDumpFile, 'wb') as dumpFile:
                        cPickle.dump(self, dumpFile)
                except Exception:
                    # 'except Exception' lets KeyboardInterrupt/SystemExit
                    # propagate instead of being reported as a dump failure.
                    raise Exception("Cannot create pickle dumpfile " +
                                    self.pickleDumpFile)

            if self._checkConvergence():
                print("EM converged!")
                break

            for c in range(self.nChains):  # clear expectedStates
                self.expectedStates[c] = np.zeros(
                    np.shape(self.nodeStates[c].data))

        return self

    def _checkConvergence(self):
        # To do, add convergence checking code
        if len(self.likelihood) < 20:
            return False

        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml) / abs(ml)
        return ratio <= 0.001

    def _updateActivationStates(self):
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        # interate through all nodes.
        activationNode = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE'
        ]

        for nodeId in activationNode:
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)

                # sample states of current node based on the prob, and update
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState

                # clamp the activationState of perturbed nodes to a fix value
                if nodeId in self.dictPerturbEffect:
                    # the diction keeps a list conditins under which the node is perurbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(
                            condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState == 1,
                                                indx] = state

    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate the marginal probability of a node's state set to "1" conditioning 
        on all evidence.
        
        args:
             nodeId   A string id of the node of interest
             c        An integer indicate the chain from which the parameter 
                         vector to be used  
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception(
                "Call _caclNodeMarginalProb on an observed variable " + nodeId)

        nCases, nAntibody = np.shape(self.obsData.data)

        # collect the state of the predecessors of the node
        preds = self.network.predecessors(nodeId)
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:  # if the node has parents
            # calculate p(curNode = 1 | parents);
            nodeParams = nodeObj.params[c, :]
            predStates = np.column_stack(
                (np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
            pOneCondOnParents = 1 / (1 +
                                     np.exp(-np.dot(predStates, nodeParams)))
            pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(np.float).eps
            pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(np.float).eps
            logProbOneCondOnParents = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect  evidence from  children
        logProbChildCondOne = 0  # the prob of child conditioning on current node == 1
        logProdOfChildCondZeros = 0

        children = self.network.successors(nodeId)
        if len(children) > 0:
            for child in children:
                childNodeObj = self.network.node[child]['nodeObj']
                curChildStates = self.nodeStates[c].getValuesByCol(child)

                # Collect states of the predecessors of the child
                childPreds = self.network.predecessors(child)
                childNodeParams = childNodeObj.params[c, :]
                childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
                childPredStates = np.column_stack(
                    (np.ones(nCases), childPredStates
                     ))  # padding data with a column ones as bias

                # Set the state of current node to ones
                curNodePosInPredList = childPreds.index(
                    nodeId) + 1  # offset by 1 because padding
                if childNodeParams[
                        curNodePosInPredList] == 0:  # not an real edge
                    continue
                childPredStates[:, curNodePosInPredList] = np.ones(nCases)
                pChildCondCurNodeOnes = 1 / (
                    1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeOnes[pChildCondCurNodeOnes == 1] -= np.finfo(
                    np.float).eps
                pChildCondCurNodeOnes[pChildCondCurNodeOnes == 0] += np.finfo(
                    np.float).eps
                logProbChildCondOne += np.log(curChildStates *
                                              pChildCondCurNodeOnes +
                                              (1 - curChildStates) *
                                              (1 - pChildCondCurNodeOnes))

                # set the state of the current node (nodeId) to zeros
                childPredStates[:, curNodePosInPredList] = np.zeros(nCases)
                pChildCondCurNodeZeros = 1 / (
                    1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeZeros[pChildCondCurNodeZeros ==
                                       1] -= np.finfo(np.float).eps
                pChildCondCurNodeZeros[pChildCondCurNodeZeros ==
                                       0] += np.finfo(np.float).eps
                logProdOfChildCondZeros += np.log(curChildStates *
                                                  pChildCondCurNodeZeros +
                                                  (1 - curChildStates) *
                                                  (1 - pChildCondCurNodeZeros))

        # now we can calculate the marginal probability of current node
        curNodeMarginal = 1 / (
            1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros -
                       logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal

    def parseGlmnetCoef(self, glmnet_res):
        """ Parse the 'beta' matrix returned by calling glmnet through RPy2.
            Return the first column of 'beta' matrix of the glmnet object 
            with 3 or more non-zero values 
            """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # Read in lines of beta matrix txt, which is a nVariables * nLambda.
        # Since we call glmnet by padding x with a column of 1s, we only work
        # with the 'beta' matrix returned by fit
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        dimStr = re.search("\d+\s+x\s+\d+", betaLines[1]).group(0)
        if not dimStr:
            raise Exception(
                "'parse_glmnet_res' could not determine the dims of beta")
        nVariables, nLambda = map(int, dimStr.split(' x '))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=np.float)

        # glmnet print beta matrix in mulitple blocks with
        # nVariable * blockSize
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = -blockSize
        for line in betaLines:  #read in blocks
            m = re.search('^V\d+', line)
            if not m:  # only find the lines begins with 'V\d'
                continue
            else:
                rowIndx = int(m.group(0)[1:len(m.group(0))])
            if rowIndx == 1:
                curBlockColStart += blockSize

            # set 'rowIndx' as start from 0
            rowIndx -= 1

            fields = line.rstrip().split()
            fields.pop(0)
            if len(fields) != blockSize:
                blockSize = len(fields)
            for j in range(blockSize):
                if fields[j] == '.':
                    continue
                else:
                    betaMatrix[rowIndx,
                               curBlockColStart + j] = float(fields[j])

        return a0, betaMatrix

    def _updteParams(self, alpha=0.1, nparents=None):
        """Refit the logistic parameters of every hidden node (M-step).

        For each node that is neither FLUORESCENCE nor PERTURBATION, and for
        each chain, a binomial glmnet model is fitted with the expected
        states of the node's predecessors (plus a leading column of ones) as
        x and the node's current states as y.  From the returned coefficient
        path the first column with at least ``nparents`` non-zero entries is
        kept (the last usable column if none qualifies); if it holds more
        than ``nparents`` non-zeros, the smallest ones in absolute value are
        set to zero.  The result is stored in ``nodeObj.params[c, :]`` and
        the raw fits in ``nodeObj.fitRes``.  Nodes without predecessors have
        no parameters and are skipped.

        Args:
            alpha: passed to glmnet as ``alpha``.
            nparents: number of non-zero coefficients to keep; falls back to
                ``self.nParents`` when not given.
        """
        nCases = np.shape(self.obsData.data)[0]
        if not nparents:
            nparents = self.nParents

        def randomRows(count):
            # floor(uniform * nCases) as an integer array usable as an index
            return np.floor(np.random.rand(count) * nCases).astype(int)

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = list(self.network.predecessors(nodeId))
            if not preds:
                # params is None for parent-less nodes; nothing to fit
                continue
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                # Constant x columns or a constant y lead to problems for
                # glmnet, so a few randomly chosen entries are flipped.
                # NOTE(review): y is modified in place; if getValuesByCol
                # returns a view this also alters the stored node states.
                rIndx = randomRows(50)
                if np.sum(y) == nCases:  # if every y == 1
                    y[rIndx] = 0
                elif np.sum(1 - y) == nCases:  # if every y == 0
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                # NOTE(review): the leading column of ones always matches
                # this test, so three of its entries are zeroed as well.
                for col in np.where(np.sum(x, 0) == nCases)[0]:
                    x[randomRows(3), col] = 0
                for col in np.where(np.sum(1 - x, 0) == nCases)[0]:
                    x[randomRows(3), col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet(x, y, alpha=alpha, family="binomial", intercept=0)
                nodeObj.fitRes.append(fit)

                # extract coefficients from glmnet; keep the first set of
                # beta with nparents non-zero values
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if np.sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if np.sum(myparams != 0.) > nparents:
                    # Trim with the effective 'nparents', so an explicit
                    # argument is honoured here as well as in the search.
                    sortedParams = sorted(np.abs(myparams))
                    myparams[np.abs(myparams) < sortedParams[-nparents]] = 0.

                nodeObj.params[c, :] = myparams

    def getStimuliSpecificNet(self, stimulus):
        """Build a reduced, signed network of edges relevant to one stimulus.

        Steps:
          1. A fluorescence node is "responsive" when an R ``t.test`` between
             its observed values in the control cases and in the stimulus
             cases gives p < 0.05; the first predecessor of each responsive
             node is recorded as active.
          2. Every edge from an activation-state node ``u`` to a
             phosphorylation-state node ``v`` is re-rooted at the antibodies
             of ``u`` and signed by the coefficient of ``u`` in ``v``'s
             parameters, summed over chains.  Edges whose coefficient is
             exactly zero in more than 90% of the chains are dropped.
          3. Nodes that are not active and have no successors or no
             predecessors are pruned repeatedly until none remain.

        Note that cycles are NOT removed; the result is not guaranteed to be
        a DAG.

        Args:
            stimulus: Column name of the stimulus of interest.

        Returns:
            A ``nx.DiGraph`` whose edges carry ``effect`` ("+" or "-") and
            ``betaValue`` attributes.

        Raises:
            Exception: If ``stimulus`` is not a column of the node states, or
                if a node's parameter vector length does not equal its number
                of predecessors plus one.
        """
        self.stimuli = [
            'EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum'
        ]
        if stimulus not in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus +
                            "' is not in the experiment data")

        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        # control cases: none of the known stimuli was applied
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli),
                              1) == 0

        # identify the nodes to keep by testing whether a node responds to
        # the stimulus
        activeNodes = set([stimulus])
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type not in (
                    'FLUORESCENCE', 'fluorescence'):
                continue
            nodeValues = self.obsData.getValuesByCol(nodeId)
            ttestRes = R('t.test')(
                robjects.FloatVector(nodeValues[controlCases]),
                robjects.FloatVector(nodeValues[stimulusCases]))
            pvalue = np.array(ttestRes.rx('p.value')[0])[0]
            if pvalue < 0.05:
                activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy the relevant edges into a new graph, redirecting edges that
        # leave an activation-state node so they start at its antibodies
        activationTypes = ('ACTIVATIONSTATE', 'activeState')
        phosTypes = ('PHOSPHORYLATIONSTATE', 'phosState')
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            uObj = self.network.node[u]['nodeObj']
            vObj = self.network.node[v]['nodeObj']
            if uObj.type not in activationTypes or vObj.type not in phosTypes:
                continue

            # parameters are laid out as [intercept, one entry per
            # predecessor], hence the "+ 1" offsets below
            vPreds = self.network.predecessors(v)
            uIndx = vPreds.index(u)
            vParams = np.sum(vObj.params, 0)
            if len(vParams) != (len(vPreds) + 1):
                raise Exception("Bug in retrieving parameters of node v " + v)
            paramZeros = np.sum(vObj.params == 0, 0)
            # builtin float: the np.float alias no longer exists in NumPy
            if float(paramZeros[uIndx + 1]) / float(self.nChains) > .9:
                continue  # don't add an edge whose beta is (almost) always 0

            beta = vParams[uIndx + 1]
            for ab in self.dictProteinToAntibody[u]:
                if ab not in self.network:
                    continue
                # sanity check on the parameters of the activation node
                uPreds = self.network.predecessors(u)
                uParams = np.mean(uObj.params, 0)
                if len(uParams) != (len(uPreds) + 1):
                    raise Exception(
                        "Bug in retrieving parameters of node u " + u)

                if beta > 0.:
                    tmpNet.add_edge(ab, v, effect="+", betaValue=beta)
                elif beta < 0.:
                    tmpNet.add_edge(ab, v, effect="-", betaValue=beta)

        # repeatedly remove inactive nodes that have become a source or a
        # sink; removing one layer can expose the next
        while True:
            leafNodes = [
                nodeId for nodeId in tmpNet
                if nodeId not in activeNodes and (
                    len(tmpNet.successors(nodeId)) == 0
                    or len(tmpNet.predecessors(nodeId)) == 0)
            ]
            if not leafNodes:
                break
            for leaf in leafNodes:
                tmpNet.remove_node(leaf)

        return tmpNet

    def toGraphML(self, filename):
        """Write the edge structure of the network to a GraphML file.

        Only the edges (and therefore only nodes that have at least one edge)
        are copied into a fresh graph; node attributes such as ``nodeObj`` are
        not carried over.

        Args:
            filename: Path of the GraphML file to write.
        """
        tmpNet = nx.DiGraph()
        # add_edge() expects the two endpoints as separate arguments; handing
        # it a single (u, v) tuple fails, so add the tuples in bulk instead
        tmpNet.add_edges_from(self.network.edges())

        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)
def calcTCI (mutcnaMatrixFN, degMatrixFN, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2], 
              v0=0.2, ppiDict = None, dictGeneLength = None, outputPath = ".", opFlag = None, rowBegin=0, rowEnd = None):
    """ 
    calcTCI (mutcnaMatrixFN, degMatrixFN, alphaNull, alphaIJKList, v0, ...)
    
    Calculate the causal scores between each pair of SGA and DEG observed in
    each tumor, and write one posterior matrix per tumor to
    "<outputPath>/<tumor name>.csv".
    
    Inputs:
        mutcnaMatrixFN      A file containing a N x G binary matrix containing the mutation and CNA 
                            data of all tumors.  N is the number of tumors and 
                            G is the total number of unique genes.  For a
                            tumor, genes that have SGAs are indicated by "1"s and "0" 
                            otherwise. 
        degMatrixFN         A file containing a N x G' binary matrix representing DEG
                            status.  A "1" indicates a gene is differentially expressed
                            in a tumor.
        
        alphaNull           A list of Dirichlet hyperparameters passed to calcNullF to
                            score the DEGs under the null cause 'A0'
                            
        alphaIJKList        A list of Dirichlet hyperparameters for calculating the prior
                            of conditional probability parameters. alphaIJK[0]: mut == 0 && deg == 0;
                            alphaIJK[1]: mut == 0 && deg == 1; alphaIJK[2]: mut == 1 && deg == 0;
                            alphaIJK[3]: mut == 1 && deg == 1
                            
        v0                  A float scalar indicating the prior probability that a DEG
                            is caused by a non-SGA factor 
        
        ppiDict             A dictionary keeping the PPI network in the form of an adjacency list (a dictionary of dictionaries)
        
        dictGeneLength      A dictionary keeping the length of each of the G genes in the 
                            mutcnaMatrix.  Required: the function exits when it is missing.

        outputPath          Directory the per-tumor result files are written to

        opFlag              None for single-gene SGAs, or the module constants AND / OR to
                            score combined SGA events
    
        rowBegin, rowEnd    These two arguments allow the user to choose which block of tumors
                            is processed by this call: rows rowBegin .. rowEnd - 1 (rowEnd is
                            exclusive).  A missing or too large rowEnd means "through the last
                            tumor".  This can be used to process multiple blocks in parallel.

    The function terminates the process via sys.exit() when an input cannot be
    loaded or is inconsistent.
    """
    
    # read in data in the form of NamedMatrix 
    try:
        mutcnaMatrix  = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit() 
        
    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit()
        
    # row names may or may not be quoted; compare them with quotes stripped
    exprsTumorNames = [x.replace("\"", "") for x in degMatrix.getRownames()]
    mutTumorNames = [x.replace("\"", "") for x in mutcnaMatrix.getRownames()]
    if exprsTumorNames != mutTumorNames:
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        print(degMatrix.getRownames())
        print(mutcnaMatrix.getRownames())
        sys.exit()
    
    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit()
                        
    tumorNames = degMatrix.getRownames()
    nTumors, nMutGenes = mutcnaMatrix.shape()
    
    mutGeneNames = mutcnaMatrix.getColnames()
    degGeneNames = degMatrix.getColnames()
    
    # rowEnd is used as the (exclusive) stop of range() below, so the default
    # and the upper clamp must be nTumors; clamping to nTumors - 1 would make
    # the last tumor impossible to process.
    if not rowEnd or rowEnd > nTumors:
        rowEnd = nTumors
    elif rowEnd < rowBegin:
        print("Invalid rowEnd < rowBegin arguments given.")
        sys.exit()

    if rowBegin > rowEnd:
        print("Invlid rowBegin > rowEnd argument given.")
        sys.exit()

    negInf = -np.inf  # log(0); spelled this way because np.NINF is gone in NumPy 2

    # loop through individual tumors and calculate the causal scores between
    # each pair of SGA and DEG
    print("Done with loading data, start processing tumor " + str(rowBegin))
    for t in range(rowBegin, rowEnd):
        # pacifier
        if t % 50 == 0:
            print("Processed %s tumors" % str(t))
        
        # collect data related to DEGs: the genes differentially expressed in
        # tumor t, and their status across all tumors
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        nTumorDEGs = len(degGeneIndx)  # corresponding to n, the number of DEGs in a given tumor
        tumorDEGMatrix = degMatrix.data[:,degGeneIndx]
        
        # collect data related to mutations
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]
        if len(tumormutGeneIndx) < 2:
            print(tumorNames[t] + " has less than 2 mutations, skip.")
            continue
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]        
      
        # sub-matrix of mutcnaMatrix that only contains the genes mutated in
        # tumor t.  It is built before the opFlag dispatch because the AND
        # branch needs it as its input.
        tumorMutMatrix = mutcnaMatrix.data[:,  tumormutGeneIndx]
        if opFlag == AND:
            tmpNamedMat = NamedMatrix(npMatrix = tumorMutMatrix, colnames = tumorMutGenes, rownames = tumorNames)
            tumorNamedMatrix = createANDComb(tmpNamedMat, opFlag)
            if not tumorNamedMatrix:  # no combined mutation available for this tumor
                continue
            tumorMutMatrix = tumorNamedMatrix.data
            tumorMutGenes = tumorNamedMatrix.colnames           
        elif opFlag == OR:
            tumorMutMatrix = createORComb(tumorMutGenes, ppiDict, mutcnaMatrix)      
           
        # log prior for each (possibly combined) mutation event: an
        # m-dimension vector with m being the number of events
        if not opFlag:
            lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)
        elif opFlag == AND:
            lntumorMutPriors = calcLnCombANDPrior(tumorMutGenes, dictGeneLength, v0)
        elif opFlag == OR:
            lntumorMutPriors = calcLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, v0)
            
        # 'A0' is the null cause; its row is stacked below the SGA rows
        tumorMutGenes.append('A0')
        
        # calculate the pairwise likelihood that an SGA causes a DEG
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix,  alphaIJKList)        
        # likelihood of the expression data conditioned on A0, stacked onto
        # the LnFScore; equivalent to adding a column of '1' for A0 in tumorMutMatrix
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))
               
        # tile the priors into a matrix with one column per DEG
        tumorMutPriorMatrix = np.tile(lntumorMutPriors, (nTumorDEGs, 1)).T
        
        lnFScore = add(tumorLnFScore, tumorMutPriorMatrix)
        
        # normalize each DEG column in log space so that the posteriors of
        # all candidate causes sum to one
        columnAccumLogSum = np.zeros(nTumorDEGs)        
        for col in range(nTumorDEGs):
            currLogSum = negInf
            for j in range(lnFScore.shape[0]):
                if lnFScore[j,col] == negInf:
                    continue
                currLogSum = logSum(currLogSum, lnFScore[j,col])             
            columnAccumLogSum[col] = currLogSum
                
        normalizer = np.tile(columnAccumLogSum, (lnFScore.shape[0], 1))      

        posterior = np.exp(add(lnFScore, - normalizer))
        
        # write out the results        
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)
        if "\"" in tumorNames[t]:
            tumorNames[t] = tumorNames[t].replace("\"", "")    
        tumorPosterior.writeToText(filePath = outputPath, filename = tumorNames[t] + ".csv")
Example #5
0
    def __init__(self, nodeFile , dataMatrixFile , perturbMatrix = None, missingDataMatrix=None):
        """Load the data matrices and build the initial signaling network.

        For every "protein,antibody" line of the node file three nodes are
        created: the protein (ACTIVATIONSTATE), the antibody
        (PHOSPHORYLATIONSTATE) and the antibody name + 'F' (FLUORESCENCE),
        with edges antibody -> protein and antibody -> fluorescence.  Every
        protein is then connected to each antibody that is not its own, and
        every perturbation is connected to each protein.

        Args:
            nodeFile: Path of a text file holding one comma-separated
                "protein,antibody" pair per line.
            dataMatrixFile: Path of the observed data matrix.  'F' is appended
                to its column names so they match the fluorescence nodes.
            perturbMatrix: Optional path of the perturbation matrix; its
                columns become PERTURBATION nodes.
            missingDataMatrix: Optional path of the missing-data matrix; 'F'
                is appended to its column names as well.

        Raises:
            Exception: If ``dataMatrixFile`` or ``nodeFile`` is empty, the
                node file cannot be read, or a column of the missing-data
                matrix sums to the number of cases.
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        # defaults so that the attributes exist (and the loops below are
        # no-ops) when no perturbation matrix is supplied
        self.perturbData = None
        self.perturbInstances = []
        self.nChains = 1

        # protein -> list of (perturbation condition, clamped state)
        self.dictPerturbEffect = {
            'AKT1': [('GSK690693', 0), ('GSK690693_GSK1120212', 0)],
            'MAP2K1': [('GSK690693_GSK1120212', 0)],
            'EGFR': [('EGF', 1), ('FGF1', 1)],
        }

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception("Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            self.perturbInstances = self.perturbData.getColnames()
        perturbInstances = self.perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # sum the underlying array, not the NamedMatrix wrapper
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception ("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [s + 'F' for s in self.missingDataMatrix.colnames]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception( "Failed to open the file containing nodes")
        if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # blank lines (e.g. the empty tail left by splitting a
                # '\r'-terminated file) do not define a node
                continue
            protein, antibody = line.split(',')

            self.dictProteinToAntibody.setdefault(protein, []).append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody, nodeObj= SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
            self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))

        # Add edges between PERTURBATION, protein activity, and phosphorylation layers
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue  # a protein does not regulate its own antibody sites
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)
Example #6
0
class PyGibbCAMP:  
    ## Constructor
    #  @param nodeFile  A string of pathname of file containing nodes.  The 
    #                   name, type, measured
    #  @param edgeFile  A list of tuples, each containing a source and sink node 
    #                   of an edge
    #  @param dataMatrixFile  A string to data
    def __init__(self, nodeFile , dataMatrixFile , perturbMatrix = None, missingDataMatrix=None):
        """Load the data matrices and build the initial signaling network.

        For every "protein,antibody" line of the node file three nodes are
        created: the protein (ACTIVATIONSTATE), the antibody
        (PHOSPHORYLATIONSTATE) and the antibody name + 'F' (FLUORESCENCE),
        with edges antibody -> protein and antibody -> fluorescence.  Every
        protein is then connected to each antibody that is not its own, and
        every perturbation is connected to each protein.

        Args:
            nodeFile: Path of a text file holding one comma-separated
                "protein,antibody" pair per line.
            dataMatrixFile: Path of the observed data matrix.  'F' is appended
                to its column names so they match the fluorescence nodes.
            perturbMatrix: Optional path of the perturbation matrix; its
                columns become PERTURBATION nodes.
            missingDataMatrix: Optional path of the missing-data matrix; 'F'
                is appended to its column names as well.

        Raises:
            Exception: If ``dataMatrixFile`` or ``nodeFile`` is empty, the
                node file cannot be read, or a column of the missing-data
                matrix sums to the number of cases.
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        # defaults so that the attributes exist (and the loops below are
        # no-ops) when no perturbation matrix is supplied
        self.perturbData = None
        self.perturbInstances = []
        self.nChains = 1

        # protein -> list of (perturbation condition, clamped state)
        self.dictPerturbEffect = {
            'AKT1': [('GSK690693', 0), ('GSK690693_GSK1120212', 0)],
            'MAP2K1': [('GSK690693_GSK1120212', 0)],
            'EGFR': [('EGF', 1), ('FGF1', 1)],
        }

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception("Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            self.perturbInstances = self.perturbData.getColnames()
        perturbInstances = self.perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # sum the underlying array, not the NamedMatrix wrapper
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception ("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [s + 'F' for s in self.missingDataMatrix.colnames]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception( "Failed to open the file containing nodes")
        if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # blank lines (e.g. the empty tail left by splitting a
                # '\r'-terminated file) do not define a node
                continue
            protein, antibody = line.split(',')

            self.dictProteinToAntibody.setdefault(protein, []).append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody, nodeObj= SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
            self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))

        # Add edges between PERTURBATION, protein activity, and phosphorylation layers
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue  # a protein does not regulate its own antibody sites
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)
            
        
    ## Init parameters of the model
    #  In a Bayesian network setting, the joint probability is calculated
    #  as the product of a series of conditional probabilities.  The parameters
    #  of the PyCAMP model define p(X | Pa(X)).  For an observed fluorescence
    #  node, the conditional probability is a mixture of two Gaussian
    #  distributions; therefore, the parameters are two pairs of mu and sigma.
    #  For the hidden variables representing phosphorylation states and
    #  activation states of proteins, the conditional probability is defined
    #  by a logistic regression.  Therefore, the parameters associated with
    #  such a node are a vector of real numbers.
    #
    def _initParams(self):
        print "Initialize parameters associated with each node in each MCMC chain"
        for nodeId in self.network: 
            self._initNodeParams(nodeId)
            
    def _initNodeParams(self, nodeId):
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':                
            # Estimate mean and sd of fluo signal using mixture model
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames():
                nodeData = self.obsData.getValuesByCol( nodeId)
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(nodeId) == 0]
            else:
                nodeData = self.obsData.getValuesByCol(nodeId)
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):   
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k = 2 )
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c,:] = np.array(mixGaussians[2])
                nodeObj.sigmas[c,:] = np.array(mixGaussians[3])            
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params  = None
                
    
    ## Initialize latent variables
    #    
    #
    def _initHiddenStates(self):
        """Create the initial node-state matrix of every chain.

        For each chain a NamedMatrix is built whose columns are the hidden
        (unmeasured) nodes followed by the perturbation columns.  Hidden
        states start as 1 with probability 0.3.  Each phosphorylation node is
        then set from its fluorescence data: the state is 1 where the
        Gaussian log-density of component 1 exceeds that of component 0
        (using the chain's ``mus``/``sigmas``).  Cases whose fluorescence
        value is flagged as missing are instead drawn at random (1 with
        probability 0.3).

        Results are stored in ``self.nodeStates`` (one matrix per chain).
        """
        hiddenNodes = [n for n in self.network if not self.network.node[n]['nodeObj'].bMeasured]
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        nCases, nAntibody = self.obsData.shape()
        caseNames = self.obsData.getRownames()

        self.nodeStates = list()
        for c in range(self.nChains):
            initStates = np.zeros((nCases, len(hiddenNodes)))
            initStates[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            initStates = np.column_stack((initStates, self.perturbData.data))
            colnames = hiddenNodes + self.perturbData.colnames
            self.nodeStates.append(NamedMatrix(npMatrix = initStates, colnames = colnames, rownames = caseNames))

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)

                # Gaussian log-densities up to a shared additive constant
                phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])
                phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])

                phosState = np.zeros(nCases)
                phosState[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:, nodeIndx] = phosState

                # take care of missing values by random sampling.  The
                # missing-data columns carry the fluorescence names (the
                # constructor appends 'F' to them), so the lookup must use
                # fluoNode; the bare phosphorylation name never matches.
                if self.missingDataMatrix and fluoNode in self.missingDataMatrix.getColnames():
                    missingCases = self.missingDataMatrix.getValuesByCol(fluoNode) == 1
                    randomState = np.zeros(int(np.sum(missingCases)))
                    randomState[np.random.rand(len(randomState)) <= 0.3] = 1
                    self.nodeStates[c].data[missingCases, nodeIndx] = randomState
                    
        
        
    ## Calculate the marginal probability of observing the measured data by
    #  integrating out all possible setting of latent variable states and 
    #  model parameters.            
    def calcEvidenceLikelihood(self):
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data) 
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeObj.params[c,:])))
                pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(np.float).eps
                
                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
                + (1 - nodeData) * np.log(1 - pOneCondOnParents))
                
            loglikelihood /= self.nChains
            return loglikelihood
        
    ## Perform graph search
    def trainGibbsEM(self, nChains = 10, alpha = 0.1, nParents = 4, nSamples = 5, pickleDumpFile = None, maxIter = 1000):
        self.nChains = nChains
        self.alpha = alpha  
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents
        
        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(self.alpha) +  ".pickle"  
        
        # check if the network and data agrees
        nodeToDelete = list()
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' and nodeId not in self.obsData.getColnames():
                print "Node " + nodeId + " don't has associated data"
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print "removing node " + nodeId
                self.network.remove_node(nodeId)

        # Starting EM set up Markov chains  to train a model purely based on prior knowledge        
        self._initParams()
        self._initHiddenStates()

        # perform update of latent variables in a layer-wise manner
        self.likelihood = list()        
        
        self.expectedStates = list()
        nCases, nAntibodies = np.shape(self.obsData.data)
        for c in range(self.nChains):                  
            # each chain collect expected statistics of nodes from samples along the chain
            self.expectedStates.append(np.zeros(np.shape(self.nodeStates[c].data)))

        print "Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(self.nChains) + "; nSamples = " + str (self.nSamples) + "; nParents = " + str(self.nParents)
        optLikelihood = float("-inf")
        bConverged = False
        sampleCount = 0
        
        likelihood = self.calcEvidenceLikelihood()
        print "nIter: 0"  + "; log likelihood of evidence: " + str(likelihood)
        self.likelihood.append(likelihood)
        for nIter in range(maxIter): 
                
            # E-step of EM
            self._updateActivationStates()            
            if  (nIter+1) % 2 == 0: # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] +=  self.nodeStates[c].data                
                
            # M-step of EM.  We only update parameters after a collecting a certain number of samples
            if sampleCount >= self.nSamples:                    
                sampleCount = 0
                 # take expectation of sample states
                self.expectedStates = map(lambda x: x / self.nSamples, self.expectedStates)
                self._updteParams(self.alpha, nparents = self.nParents)
                
                likelihood = self.calcEvidenceLikelihood()
                self.likelihood.append(likelihood)   
                print "nIter: " + str(nIter + 1) + "; log likelihood of evidence: " + str(likelihood)                    

                # collect the current best fit models
                if likelihood > optLikelihood:
                    optLikelihood = likelihood
                    try:
                        cPickle.dump(self, open(self.pickleDumpFile, 'wb'))
                    except: 
                        raise Exception("Cannot create pickle dumpfile " + self.pickleDumpFile)

                bConverged = self._checkConvergence()
                if bConverged:
                    print "EM converged!"
                    break
                
                for c in range(self.nChains):  # clear expectedStates
                    self.expectedStates[c] = np.zeros(np.shape(self.nodeStates[c].data))
                
        # now try to delete edges that does contribute to evidence
        #self.trimEdgeByConsensus(.9)
        return self  
            
    def _checkConvergence(self):
        # To do, add convergence checking code
        if len(self.likelihood) < 20:
            return False
            
        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml ) / abs(ml)        
        return ratio <= 0.001

    def _updateActivationStates(self):
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        # interate through all nodes. 
        activationNode = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']
                    
        for nodeId in activationNode: 
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)
                
                # sample states of current node based on the prob, and update 
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState
                
                # clamp the activationState of perturbed nodes to a fix value
                if nodeId in self.dictPerturbEffect:
                    # the diction keeps a list conditins under which the node is perurbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState==1, indx] = state
                        
            
    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate, for every case, the probability that a hidden node's state
        is "1" given the current states of its parents and children.

        The log-probabilities of the node being 1 and 0 given its parents are
        combined with the log-likelihood of each child's current state under
        both settings of the node, and the two totals are normalised through a
        logistic function.

        args:
             nodeId   A string id of the node of interest
             c        An integer indicate the chain from which the parameter
                         vector to be used

        returns:
             A numpy vector holding one probability per case.  If the node has
             no parents and no child with a non-zero parameter for it, the
             result is the scalar 0.5.

        raises:
             Exception if the node is an observed (measured) variable.
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception("Call calcNodeCondProb on an observed variable " + nodeId)

        nCases = np.shape(self.obsData.data)[0]
        chainStates = self.nodeStates[c]
        # the builtin float is used because newer numpy releases no longer
        # provide the np.float alias
        eps = np.finfo(float).eps

        def boundedLogistic(design, weights):
            # logistic response kept strictly inside (0, 1) so that the
            # logarithms taken afterwards stay finite
            prob = 1 / (1 + np.exp(-np.dot(design, weights)))
            prob[prob == 1] -= eps
            prob[prob == 0] += eps
            return prob

        def withBias(states):
            # pad the state columns with a leading column of ones (bias term)
            return np.column_stack((np.ones(nCases), states))

        # contribution of the parents: p(curNode | parents)
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        preds = self.network.predecessors(nodeId)
        if len(preds) > 0:
            pOneCondOnParents = boundedLogistic(
                withBias(chainStates.getValuesByCol(preds)), nodeObj.params[c, :])
            logProbOneCondOnParents = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # contribution of the children: p(child state | curNode = 1 / 0)
        logProbChildCondOne = 0
        logProbChildCondZero = 0
        for child in self.network.successors(nodeId):
            childPreds = self.network.predecessors(child)
            childNodeParams = self.network.node[child]['nodeObj'].params[c, :]

            # position of the current node among the child's parents,
            # offset by 1 because of the bias column
            curNodePos = childPreds.index(nodeId) + 1
            if childNodeParams[curNodePos] == 0:  # not a real edge
                continue

            curChildStates = chainStates.getValuesByCol(child)
            childPredStates = withBias(chainStates.getValuesByCol(childPreds))

            # likelihood of the child's state with the current node forced to 1
            childPredStates[:, curNodePos] = np.ones(nCases)
            pChildOne = boundedLogistic(childPredStates, childNodeParams)
            logProbChildCondOne += np.log(
                curChildStates * pChildOne + (1 - curChildStates) * (1 - pChildOne))

            # ... and with the current node forced to 0
            childPredStates[:, curNodePos] = np.zeros(nCases)
            pChildZero = boundedLogistic(childPredStates, childNodeParams)
            logProbChildCondZero += np.log(
                curChildStates * pChildZero + (1 - curChildStates) * (1 - pChildZero))

        # normalise: p(1) / (p(1) + p(0)) written as a logistic of the log-odds
        logOddsZero = (logProbZeroCondOnParents + logProbChildCondZero
                       - logProbOneCondOnParents - logProbChildCondOne)
        return 1 / (1 + np.exp(logOddsZero))
    

    def parseGlmnetCoef(self, glmnet_res):
        """ Parse the intercepts and the 'beta' matrix of a glmnet fit obtained through RPy2.

            The coefficient matrix is rebuilt from the printed (text) form of
            glmnet_res.rx('beta').  Rows are recognised by their 'V<number>'
            label and entries printed as '.' are left at zero.  The printout
            may be split into several column blocks; a new block is detected
            each time row 'V1' shows up again.

            args:
                 glmnet_res   the glmnet fit; only its rx('a0') and rx('beta')
                              lookups are used

            returns:
                 a0           the intercepts taken from rx('a0')
                 betaMatrix   an nVariables x nLambda numpy matrix of coefficients

            raises:
                 Exception if the dimensions of 'beta' cannot be found in the
                 printed output
            """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # The second printed line is expected to carry "<nVariables> x <nLambda>".
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        dimMatch = None
        if len(betaLines) > 1:
            dimMatch = re.search(r"(\d+)\s+x\s+(\d+)", betaLines[1])
        # check the match object itself: calling .group() on a failed search
        # would raise before any meaningful error could be reported
        if not dimMatch or len(betaLines) < 5:
            raise Exception("'parse_glmnet_res' could not determine the dims of beta")
        nVariables = int(dimMatch.group(1))
        nLambda = int(dimMatch.group(2))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=float)

        # glmnet prints the beta matrix in multiple blocks of
        # nVariables x blockSize; the fifth line is taken as the first data
        # row and gives the width of the first block
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = -blockSize
        rowPattern = re.compile(r'^V(\d+)')
        for line in betaLines:
            m = rowPattern.search(line)
            if not m:  # only lines that begin with 'V<number>' hold data
                continue
            rowIndx = int(m.group(1))
            if rowIndx == 1:
                # first row of a block: advance by the width of the previous one
                curBlockColStart += blockSize

            fields = line.rstrip().split()[1:]  # drop the row label
            blockSize = len(fields)  # the last block may be narrower
            for j, field in enumerate(fields):
                if field != '.':
                    # row labels start at 1, matrix rows at 0
                    betaMatrix[rowIndx - 1, curBlockColStart + j] = float(field)

        return a0, betaMatrix
      
        
    def _updteParams(self, alpha = 0.1, nparents=None):
        """Refit the parameters p(n | Pa(n)) of every node by logistic regression.

        For each node that is neither FLUORESCENCE nor PERTURBATION, and for
        each chain, glmnet is fitted with the expected states of the node's
        predecessors (padded with a column of ones) as X and the node's
        current states as y.  From the fitted path, the first coefficient
        column with at least 'nparents' non-zero entries is kept (the last
        column if none qualifies).  If it has more than 'nparents' non-zero
        entries, the entries smaller in magnitude than the nparents-th largest
        are zeroed.  The result is stored in nodeObj.params[c, :] and every
        fit object is collected in nodeObj.fitRes.

        args:
             alpha      forwarded to glmnet as its 'alpha' argument
             nparents   number of coefficients to retain per node; falls back
                        to self.nParents when not given (or 0)
        """
        nCases, nVariables = np.shape(self.obsData.data)
        if not nparents:
            nparents = self.nParents

        def randomRows(count):
            # 'count' row indices drawn with replacement from 0 .. nCases - 1
            return [int(math.floor(z)) for z in np.random.rand(count) * nCases]

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                # A constant response is a problem for glmnet, so up to 50
                # randomly chosen cases are flipped when y is all ones or all
                # zeros.
                # NOTE(review): if getValuesByCol returns a view, this also
                # alters the stored node states -- confirm.
                rIndx = randomRows(50)
                if sum(y) == nCases:  # if every y == 1
                    y[rIndx] = 0
                elif sum(1 - value for value in y) == nCases:  # if every y == 0
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                # Constant predictor columns get the same treatment with 3
                # random cases each; this includes the padded column of ones.
                allOnesCols = np.where(np.sum(x, 0) == nCases)[0]
                for col in allOnesCols:
                    x[randomRows(3), col] = 0
                allZerosCols = np.where(np.sum(1 - x, 0) == nCases)[0]
                for col in allZerosCols:
                    x[randomRows(3), col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet (x, y, alpha = alpha, family = "binomial", intercept = 0)
                nodeObj.fitRes.append(fit)

                # walk along the fitted path and stop at the first column
                # with at least nparents non-zero coefficients
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if sum(myparams != 0.) > nparents:
                    # keep the nparents largest magnitudes; the cut-off uses
                    # the same nparents that selected the column above
                    sortedParams = sorted(np.abs(myparams))
                    myparams[np.abs(myparams) < sortedParams[-nparents]] = 0.

                nodeObj.params[c,:] =  myparams
                        
                        
    def getStimuliSpecificNet(self, stimulus):
        """Build the antibody -> phosphorylation-state network relevant to one stimulus.

        Steps:
          1. A node counts as active when it is the predecessor of a
             fluorescence node whose observed values differ between control
             cases (none of self.stimuli applied) and stimulated cases
             (t-test p-value below 0.05).  The stimulus itself is always
             active.
          2. Every edge u -> v of self.network from an activation-state node
             to a phosphorylation-state node, whose parameter is zero in at
             most 90% of the chains, is redirected to start from each
             antibody of u (self.dictProteinToAntibody) present in the
             network.  The edge carries the parameter summed over chains as
             'betaValue' and its sign as 'effect'.
          3. Nodes that are not active and have no successors or no
             predecessors are removed repeatedly until none is left.

        Note that cycles are not removed from the result, and that calling
        this method overwrites self.stimuli with a fixed list of names.

        args:
             stimulus   column name of the stimulus in the perturbation data

        returns:
             a networkx DiGraph

        raises:
             Exception if the stimulus is not a column of the node states, or
             if a parameter vector does not match the number of predecessors
        """
        self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']
        #self.stimuli = ['loLIG1',	'hiLIG1',	'loLIG2',	'hiLIG2']
        if stimulus not in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus + "' is not in the experiment data")

        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli), 1) == 0

        # identify the nodes to keep by determining whether the measured
        # fluorescence responds to the stimulus
        activeNodes = set()
        activeNodes.add(stimulus)
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type not in ('FLUORESCENCE', 'fluorescence'):
                continue
            nodeValues = self.obsData.getValuesByCol(nodeId)
            ttestRes = R('t.test')(robjects.FloatVector(nodeValues[controlCases]),
                                   robjects.FloatVector(nodeValues[stimulusCases]))
            pvalue = np.array(ttestRes.rx('p.value')[0])[0]
            if pvalue < 0.05:
                activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy the activation-state -> phosphorylation-state edges into a new
        # graph, redirected to start from the antibodies of the source protein
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            if self.network.node[u]['nodeObj'].type not in ('ACTIVATIONSTATE', 'activeState'):
                continue
            if self.network.node[v]['nodeObj'].type not in ('PHOSPHORYLATIONSTATE', 'phosState'):
                continue

            # extract parameters associated with u and v; +1 skips the bias term
            vNodeParams = self.network.node[v]['nodeObj'].params
            vPreds = self.network.predecessors(v)
            uPos = vPreds.index(u) + 1
            vParams = np.sum(vNodeParams, 0)
            if len(vParams) != (len(vPreds) + 1):
                raise Exception ("Bug in retrieving parameters of node v " + v)
            paramZeros = np.sum(vNodeParams == 0, 0)
            if float(paramZeros[uPos]) / float(self.nChains) > .9:
                continue  # don't add edge with beta == 0 in most chains

            antibodies = [ab for ab in self.dictProteinToAntibody[u] if ab in self.network]
            if not antibodies:
                continue

            # consistency check on u's own parameters; only needed once per
            # edge because it does not depend on the antibody
            uPreds = self.network.predecessors(u)
            uParams = np.mean(self.network.node[u]['nodeObj'].params, 0)
            if len(uParams) != (len(uPreds) + 1):
                raise Exception ("Bug in retrieving parameters of node u " + u)

            # the sign comes from v's parameter for u alone; the parameter
            # linking the antibody to u is not consulted
            betaValue = vParams[uPos]
            if betaValue > 0.:
                effect = "+"
            elif betaValue < 0.:
                effect = "-"
            else:
                continue
            for ab in antibodies:
                tmpNet.add_edge(ab, v, effect = effect, betaValue = betaValue)

        # repeatedly remove inactive nodes that sit at either end of a path
        while True:
            leafNodes = [nodeId for nodeId in tmpNet
                         if nodeId not in activeNodes
                         and (len(tmpNet.successors(nodeId)) == 0
                              or len(tmpNet.predecessors(nodeId)) == 0)]
            if len(leafNodes) == 0:
                break
            for leaf in leafNodes:
                tmpNet.remove_node(leaf)

        return tmpNet
            
                         
                        
    def toGraphML(self, filename):
        """Write the edge structure of self.network to a GraphML file.

        Only the edges are copied into a fresh DiGraph before writing, so the
        attributes attached to the nodes of self.network are not part of the
        output.

        args:
             filename   path of the GraphML file to create
        """
        tmpNet = nx.DiGraph()
        # add_edge() takes the two end points as separate arguments; the
        # (u, v) tuples returned by edges() go through add_edges_from()
        tmpNet.add_edges_from(self.network.edges())

        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)
def calcTCI (mutcnaMatrixFN, degMatrixFN, tumorTypeFN = None, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2], v0 = 0.3, 
             ppiDict = None, dictGeneLength = None, outputPath = ".", opFlag = None, PANCANFlag = None, rowBegin=0, rowEnd = None):
    """ 
    Calculate the causal scores between each pair of SGA and DEG observed in each tumor
    and write one posterior matrix per tumor to '<outputPath>/<tumor name>.csv'.

    The function terminates the program through sys.exit() when required
    inputs are missing, cannot be loaded, or are inconsistent.

    Inputs:
        mutcnaMatrixFN      A file containing a N x G binary matrix containing the mutation and CNA 
                            data of all tumors.  N is the number of tumors and 
                            G is number of total number of unique genes.  For a
                            tumor, genes that have SGAs are indicated by "1"s and "0" 
                            otherwise. 
        degMatrixFN         A file contains a N x G' binary matrix representing DEG
                            status.  A "1" indicate a gene is differentially expressed
                            in a tumor.  Its row names must equal those of the
                            mutation matrix.

        tumorTypeFN         A string of filename.  The file contains N x T matrix, in which
                            each row only has one element set to 1, rest to zero, as an indicator
                            which type of cancer each tumor belongs to.  Required
                            when PANCANFlag is set.

        alphaNull           Hyperparameters handed to calcNullF when scoring A0,
                            the cause that is present in every tumor

        alphaIJKList        A list of Dirichlet hyperparameters for caulate the prior
                            of condition probability parameters. alphaIJK[0]: mut == 0 && deg == 0;
                            alphaIJK[1]: mut == 0 && deg == 1; alphaIJK[2]: mut == 1 && deg == 0;
                            alphaIJK[3]: mut == 1 && deg == 1

        v0                  A float scalar indicate the prior probability that a DEG
                            is caused by a non-SGA factor 

        ppiDict             A dictionary keeps PPI network in the form an adjecency list (a dictionary of dictionary)

        dictGeneLength      A dictionary keeps the length of each of G genes in the 
                            mutcnaMatrix.  Required.

        outputPath          Directory handed to writeToText for the result files

        opFlag              None for plain SGAs, or one of the module-level OR / AND
                            constants to work with combinations of SGA events

        PANCANFlag          A boolean flag to indicate if we are doing PANCAN

        rowBegin, rowEnd    Select the block of tumors processed by this call: rows
                            rowBegin .. rowEnd - 1.  rowEnd defaults to, and is capped
                            at, the number of tumors.  This can be used to process
                            mulitple blocks in a parallel fashion.

    Note: the list defaults of alphaNull and alphaIJKList are shared between
    calls; this function itself only passes them on.
    """
    # check if gene length dictionary is set
    if not dictGeneLength :
        print("Gene length dictionary not provided, quit\n")
        sys.exit()

    # read in data in the form of NamedMatrix 
    try:
        mutcnaMatrix  = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit() 

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit()

    mutGeneNames = mutcnaMatrix.getColnames()
    mutTumorNames = mutcnaMatrix.getRownames()
    degGeneNames = degMatrix.getColnames()
    exprsTumorNames = degMatrix.getRownames()

    #check if same tumor names from two matrices above agree
    if exprsTumorNames != mutTumorNames:
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        print(degMatrix.getRownames())
        print(mutcnaMatrix.getRownames())
        sys.exit()

    tumorNames = exprsTumorNames
    nTumors, nMutGenes = mutcnaMatrix.shape()

    # now perform PANCAN analysis related tasks
    if PANCANFlag: 
        if not tumorTypeFN:
            print("Cannot perform PANCAN analysis without tumor-type-indicator matrix")
            sys.exit()
        try: 
            tumorTypeMatrix = NamedMatrix(tumorTypeFN)
        except Exception:
            print("Failed to import tumor type file %s" % tumorTypeFN)
            sys.exit()            
        tumorTypeTumorNames = [x.replace("\"", "") for x in tumorTypeMatrix.getRownames()]
        if exprsTumorNames != tumorTypeTumorNames:
            print("The tumors for tumorTypeMatrix and degMatrix do not fully overlap!")
            sys.exit()

        tumorTypes = tumorTypeMatrix.getColnames()    
        # Calculate the prior probability that a tumor-type variable may influence a DEG
        # to be proportional to the number of tumors from a given type
        vt = np.sum(tumorTypeMatrix.data, 0)  # column sums: number of tumors of each type
        vtprior = np.divide(vt, float(nTumors)) # normalize to 1, as  prior for each type of tumor

    # Now start looping through a chunk of individual tumors and calculate the causal scores between each pair of SGA and DEG    
    print("Done with loading data, start processing tumor " + str(rowBegin))
    # rowEnd is the exclusive upper bound of the loop below, so it is capped
    # at nTumors; capping at nTumors - 1 would leave out the last tumor
    if not rowEnd or rowEnd >= nTumors:
        rowEnd = nTumors
    elif rowEnd < rowBegin:
        print("Invalid rowEnd < rowBegin arguments given.")
        sys.exit()

    if rowBegin > rowEnd:
        print("Invlid rowBegin > rowEnd argument given.")
        sys.exit()

    for t in range(rowBegin, rowEnd):
        print("processign tumor  " + tumorNames[t])
        #print pacifier
        if t % 50 == 0:
            print("\nProcessed %s tumors" % str(t))

        # collect data related to DEGs to construct a submatrix containing only DEG of the tumor
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx] 
        tumorDEGMatrix = degMatrix.data[:,degGeneIndx]

        # names of the genes that are mutated in the given tumor t
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]        
        tumorMutGenes=  [mutGeneNames[i] for i in tumormutGeneIndx] 
        nTumorMutGenes = len(tumorMutGenes)

        # now extract the sub-matrix of mutcnaMatrix that only contain the genes that are mutated in a given tumor t
        # check if special operations to create combinations of SGA events are needed.  If combination operation is needed, 
        # new combined muation matrix will be created                 
        if opFlag == OR:
            tumorMutMatrix = createORComb(tumorMutGenes, ppiDict, mutcnaMatrix)      
        else:  # default.  Extract columns of mutcnaMatrix corresponding to the altered genes          
            tumorMutMatrix = mutcnaMatrix.data[:,  tumormutGeneIndx]

        # Include the tumor-type label into the tumorMutMatrix as a tissue-specific 
        # fake Gt to capture the DEGs that has tissue-specific characterisitics 
        if PANCANFlag:
            tumorTypeLabelIndx = np.where(tumorTypeMatrix.data[t,:] == 1)[0]
            if len(tumorTypeLabelIndx) != 1:
                raise Exception("Fail to extract tumor type")  
            # indexing with the one-element array keeps the label as an N x 1 column
            tumorTypeColumn = tumorTypeMatrix.data[:, tumorTypeLabelIndx]
            tumorMutMatrix = np.hstack((tumorMutMatrix, tumorTypeColumn))                  
            # add the label to the tumorMutGenes; a plain integer is used to pick the name
            tumorTypeName = tumorTypes[tumorTypeLabelIndx[0]]        
            tumorMutGenes.append(tumorTypeName) 
            nTumorMutGenes = len(tumorMutGenes)

        # calculate single pairwise likelihood that an SGA causes a DEG.  Return a matrix where rows are mutGenes, 
        # columns are DEGs, currently without the joint impact
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix,  alphaIJKList)

        # If PANCAN analysis, construct combinations of tumor-type label with different GTs to determine the 
        # likelihood of DEG jointly conditioning on GT and tumor-type label.  This enables us to capture
        # the fact that a GT regulate a GE but they also have a high tendency in co-occurring in a specific tumor type            
        if PANCANFlag:  
            if opFlag == AND:
                raise Exception ("Combination of AND operation with PanCan analysis is not implemented")

            # Now, calcuate the log likelihood of joint impact of tumor label with individual GTs on each GE
            jointGTandTumorLableFScore = np.zeros((tumorMutMatrix.shape[1], tumorDEGMatrix.shape[1])) 

            # The four combinations of GT and label, in the order
            # (1, 1), (1, 0), (0, 1), (0, 0); mulitplication acts as AND.
            # Each term goes into its own variable so that the pairwise
            # tumorLnFScore computed above is still intact for the stacking below.
            gtAbsent = tumorMutMatrix == 0
            labelAbsent = tumorTypeColumn == 0
            jointStates = ((tumorMutMatrix, tumorTypeColumn),
                           (tumorMutMatrix, labelAbsent),
                           (gtAbsent, tumorTypeColumn),
                           (gtAbsent, labelAbsent))
            for gtState, labelState in jointStates:
                tmpMutMatrix = np.multiply(gtState, labelState)
                jointTerm = calcF(tmpMutMatrix, tumorDEGMatrix,  alphaIJKList)
                jointGTandTumorLableFScore = add(jointGTandTumorLableFScore, jointTerm)

            # stack the the joint loglikelihood matrix on top to the tumorLnFScore.  
            # The last joint row (label combined with itself) is removed
            tumorLnFScore = np.vstack((jointGTandTumorLableFScore[:-1,:] , tumorLnFScore))             

        # Calculate the likelihood that A0, which is 1 for all tumors, as a cause for DEGs.  
        # Then, stack to the LnFScore, equivalent to adding a column of '1' to 
        # represent the A0 in tumorMutMatrix
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore)) 

        # calcualte  log of the prior probability that any of mutated genes plus A0 can be a cause for a DEG.
        if PANCANFlag:
            if not opFlag:
                lntumorMutPriors = calcPanCanLnPrior(tumorMutGenes, dictGeneLength, vtprior[tumorTypeLabelIndx], v0)
            elif opFlag == AND:
                lntumorMutPriors = calcPanCanLnCombANDPrior(tumorMutGenes, dictGeneLength, vtprior[tumorTypeLabelIndx], v0)
            elif opFlag == OR:
                lntumorMutPriors = calcPanCanLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, vtprior[tumorTypeLabelIndx], v0)
        else:
            if not opFlag:
                lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)  # a m-dimension vector with m being number of mutations
            else:
                if opFlag == AND:
                    lntumorMutPriors = calcLnCombANDPrior(tumorMutGenes, dictGeneLength, v0)
                elif opFlag == OR:
                    lntumorMutPriors = calcLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, v0)

        # add to each column, note double transposes because  numpy broadcasts by row
        tumorLnFScore = np.add(tumorLnFScore.T, lntumorMutPriors).T  

        # calculate the normalizer for each column (GE).  
        colLogSum = calcColNormalizer(tumorLnFScore)       
        normalizer = np.tile(colLogSum, (tumorLnFScore.shape[0], 1))    
        posteriorAll = np.exp(add(tumorLnFScore, - normalizer))

        if PANCANFlag:
            # rows are [joint GT-label rows, single GT rows, label, A0]: sum the posterior of each
            # single GT with the posterior of its joint GT-Tumor-Type row, then keep label and A0
            posterior = np.add(posteriorAll[0:nTumorMutGenes-1, :], posteriorAll[nTumorMutGenes - 1:-2, :])
            posterior = np.vstack((posterior, posteriorAll[-2:, :]))        
        else:
            # without joint rows the matrix is already one row per entry of tumorMutGenes plus A0
            posterior = posteriorAll

        #write out the results 
        tumorMutGenes.append('A0')
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)
        tumorPosterior.writeToText(filePath = outputPath, filename = tumorNames[t] + ".csv")
Example #8
0
def calcTCI (mutcnaMatrixFN, degMatrixFN, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2], v0=0.2, dictGeneLength = None, outputPath = ".", opFlag = None):
    """ 
    Calculate the causal scores between each pair of SGA and DEG observed in each tumor
    and write one posterior matrix per tumor to
    '<outputPath>/<tumor name>-mut-vs-DEG-posterior.csv'.

    The function terminates the program through sys.exit() when an input
    matrix cannot be loaded, the tumor names of the two matrices differ, or
    the gene length dictionary is missing.

    Inputs:
        mutcnaMatrixFN      A file containing a N x G binary matrix containing the mutation and CNA 
                            data of all tumors.  N is the number of tumors and 
                            G is number of total number of unique genes.  For a
                            tumor, genes that have SGAs are indicated by "1"s and "0" 
                            otherwise. 
        degMatrixFN         A file contains a N x G' binary matrix representing DEG
                            status.  A "1" indicate a gene is differentially expressed
                            in a tumor.

        alphaNull           Hyperparameters handed to calcNullF when scoring A0,
                            the cause that is present in every tumor

        alphaIJKList        A list of Dirichlet hyperparameters for caulate the prior
                            of condition probability parameters. alphaIJK[0]: mut == 0 && deg == 0;
                            alphaIJK[1]: mut == 0 && deg == 1; alphaIJK[2]: mut == 1 && deg == 0;
                            alphaIJK[3]: mut == 1 && deg == 1

        v0                  A float scalar indicate the prior probability that a DEG
                            is caused by a non-SGA factor 

        dictGeneLength      A dictionary keeps the length of each of G genes in the 
                            mutcnaMatrix.  Required.

        outputPath          Directory handed to writeToText for the result files

        opFlag              When set, combinations of the tumor's mutations are built
                            with createComb(..., opFlag) and scored instead of the
                            single mutations
    """

    # read in data in the form of NamedMatrix 
    try:
        mutcnaMatrix  = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit() 

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit()

    if degMatrix.getRownames() != mutcnaMatrix.getRownames():
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        sys.exit()

    if  not dictGeneLength :
        print("Gene length dictionary not provided, quit\n")
        sys.exit()

    # now we iterate through each tumor to infer the causal relationship between each 
    # pair of mut - deg
    tumorNames = degMatrix.getRownames()
    nTumors, nMutGenes = mutcnaMatrix.shape()

    mutGeneNames = mutcnaMatrix.getColnames()
    degGeneNames = degMatrix.getColnames()

    # written as -np.inf because the np.NINF alias is not available in every numpy release
    negInf = -np.inf

    for t in range(nTumors):
        #print pacifier
        if t % 50 == 0:
            print("Processed %s tumors" % str(t))

        # collect data related to mutations
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]
        nTumorMutGenes = len(tumormutGeneIndx)
        tumorMutGenes=  [mutGeneNames[i] for i in tumormutGeneIndx]        

        # now extract the sub-matrix of mutcnaMatrix that only contain the genes that are mutated in a given tumor t.
        # If combination operation is needed, new combined muation matrix will be created         
        tumorMutMatrix = mutcnaMatrix.data[:,  tumormutGeneIndx]
        if opFlag:
            tmpNamedMat = NamedMatrix(npMatrix = tumorMutMatrix, colnames = tumorMutGenes, rownames = tumorNames)
            tumorNamedMatrix = createComb(tmpNamedMat, opFlag)
            if not tumorNamedMatrix:  # this tumor do not have any joint mutations that is oberved in 2% of all tumors
                continue
            # work on a copy so that appending 'A0' below leaves the matrix' own column names alone
            tumorMutGenes = list(tumorNamedMatrix.colnames)
            tumorMutMatrix = tumorNamedMatrix.data

        # log prior of each cause: single mutations by default, combinations when opFlag is set
        if not opFlag:
            lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)  # a m-dimension vector with m being number of mutations
        else:
            lntumorMutPriors = calcLnCombPrior(tumorMutGenes, dictGeneLength, v0)

        tumorMutGenes.append('A0')

        # collect data related to DEGs
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        nTumorDEGs = len(degGeneIndx)  # corresponding to n, the number of DEGs in a given tumor
        tumorDEGMatrix = degMatrix.data[:,degGeneIndx]

        # calculate pair-wise m x n matrix and append the row for A0
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix,  alphaIJKList)
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))  #check out this later

        # calcualte the prior probability that any of mutated genes can be a cause for a DEG,
        # tile it up to make an nTumorMutGenes x nTumorDEG matrix
        tumorMutPriorMatrix = np.tile(lntumorMutPriors, (nTumorDEGs, 1)).T

        lnFScore = add(tumorLnFScore, tumorMutPriorMatrix)

        # normalize lnFScore so that each column sums to one after exponentiation:
        # accumulate the log of the column sum, skipping entries that are -inf
        columnAccumLogSum = np.zeros(nTumorDEGs)        
        for col in range(nTumorDEGs):
            currLogSum = negInf
            for j in range(lnFScore.shape[0]):
                if lnFScore[j,col] == negInf:
                    continue
                currLogSum = logSum(currLogSum, lnFScore[j,col])             
            columnAccumLogSum[col] = currLogSum

        normalizer = np.tile(columnAccumLogSum, (lnFScore.shape[0], 1))      

        posterior = np.exp(add(lnFScore, - normalizer))

        #write out the results        
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)     
        tumorPosterior.writeToText(outputPath, filename = tumorNames[t] + "-mut-vs-DEG-posterior.csv")