Python NamedMatrix.getValuesByCol Examples

Programming Language: Python

Namespace/Package Name: NamedMatrix

Class/Type: NamedMatrix

Method/Function: getValuesByCol

Examples at hotexamples.com: 2

Python NamedMatrix.getValuesByCol - 2 examples found. These are the top-rated real-world Python examples of NamedMatrix.NamedMatrix.getValuesByCol, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

writeToText(3)

NamedMatrix(2)

getColnames(1)

getRownames(1)

getValuesByCol(1)

shape(1)

Example #1

Show file

File: PyGibbCAMP.py Project: xlu29466/PyGibbCAMP

class PyGibbCAMP:
    ## Constructor
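    #  @param nodeFile  Pathname of the file containing the nodes, one
    #                   comma-separated "protein,antibody" pair per line
    #  @param dataMatrixFile  Pathname of the observed data matrix file
    #  @param perturbMatrix  Optional pathname of the perturbation matrix file
    #  @param missingDataMatrix  Optional pathname of the missing-data matrix file
    #  NOTE: there is no edgeFile parameter (a list of source/sink tuples);
    #        the constructor derives every edge from the node file and the
    #        perturbation columns.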
    def __init__(self,
                 nodeFile,
                 dataMatrixFile,
                 perturbMatrix=None,
                 missingDataMatrix=None):
        """Load the data matrices and build the layered signalling network.

        For every "protein,antibody" line of `nodeFile` three nodes are
        created: the protein (type ACTIVATIONSTATE), the antibody (type
        PHOSPHORYLATIONSTATE) and antibody + 'F' (type FLUORESCENCE, the only
        one of the three flagged as measured).  Each antibody points to its
        protein and to its fluorescence node; every protein points to all
        antibodies of *other* proteins; every perturbation points to every
        protein.

        Args:
            nodeFile: Pathname of the node file ("protein,antibody" per line).
            dataMatrixFile: Pathname handed to NamedMatrix for the observed
                data; its column names get an 'F' suffix so they match the
                fluorescence node ids.
            perturbMatrix: Optional pathname handed to NamedMatrix; its
                column names become PERTURBATION nodes.
            missingDataMatrix: Optional pathname handed to NamedMatrix; its
                column names get the same 'F' suffix as the observed data.

        Raises:
            Exception: if `dataMatrixFile` or `nodeFile` is empty, if the
                node file cannot be read, or if a column of the missing-data
                matrix marks every case as missing.
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        # Defined up front so the attributes exist even without perturbMatrix.
        self.perturbData = None
        self.perturbInstances = []
        # An empty list (rather than None) keeps the loops below valid when
        # no perturbation matrix is supplied.
        perturbInstances = []
        self.nChains = 1

        self.dictPerturbEffect = {'AKT1' : [('GSK690693', 0), \
        ('GSK690693_GSK1120212', 0)], 'MAP2K1' : [('GSK690693_GSK1120212', 0)],\
        'EGFR': [('EGF' , 1), ('FGF1', 1)]}

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception(
                "Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases = np.shape(self.obsData.data)[0]
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            perturbInstances = self.perturbData.getColnames()
            self.perturbInstances = perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # Sum the underlying array, not the NamedMatrix wrapper itself.
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [
                s + 'F' for s in self.missingDataMatrix.colnames
            ]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            # 'with' closes the handle even when reading fails.
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception("Failed to open the file containing nodes")
        if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # e.g. the empty tail produced by splitting on a trailing \r
                continue
            protein, antibody = line.split(',')

            if protein not in self.dictProteinToAntibody:
                self.dictProteinToAntibody[protein] = []
            self.dictProteinToAntibody[protein].append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein,
                                      nodeObj=SigNetNode(
                                          protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody,
                                  nodeObj=SigNetNode(antibody,
                                                     'PHOSPHORYLATIONSTATE',
                                                     False))
            self.network.add_node(fluo,
                                  nodeObj=SigNetNode(fluo, 'FLUORESCENCE',
                                                     True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb,
                                  nodeObj=SigNetNode(perturb, 'PERTURBATION',
                                                     True))

        # Add edges between PERTURBATION, protein activity, and
        # phosphorylation layers
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                # a protein does not point back to its own antibodies
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)

    ## Init parameters of the model
    #  In a Bayesian network setting, the joint probability is calculated
    #  as the product of a series of conditional probabilities.  The parameters
    #  of the PyCAMP model define p(x | Pa(X)).  For an observed fluorescence
    #  node the conditional probability is a mixture of two Gaussian
    #  distributions; therefore, the parameters are two pairs of mu and sigma.
    #  For the hidden variables representing phosphorylation states and
    #  activation states of proteins, the conditional probability is defined
    #  by a logistic regression.  Therefore, the parameters associated with
    #  such a node are a vector of real numbers.
    #
    def _initParams(self):
        print "Initialize parameters associated with each node in each MCMC chain"
        for nodeId in self.network:
            self._initNodeParams(nodeId)

    def _initNodeParams(self, nodeId):
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':
            # Estimate mean and sd of fluo signal using mixture model
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames(
            ):
                nodeData = self.obsData.getValuesByCol(nodeId)
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(
                    nodeId) == 0]
            else:
                nodeData = self.obsData.getValuesByCol(nodeId)
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k=2)
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c, :] = np.array(mixGaussians[2])
                nodeObj.sigmas[c, :] = np.array(mixGaussians[3])
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params = None

    ## Initialize latent variables
    #
    #
    def _initHiddenStates(self):
        """Create one state matrix per chain and seed the hidden states.

        For each chain a NamedMatrix is appended to `self.nodeStates` whose
        columns are the unmeasured nodes followed by the perturbation
        columns.  Hidden states start as 1 with probability 0.3.  Each
        phosphorylation node is then set to 1 where the second Gaussian
        component of its fluorescence node (index 1 of mus/sigmas) explains
        the observed value better than the first, comparing log densities up
        to a shared constant.  Cases flagged as missing for that
        fluorescence column are re-drawn as 1 with probability 0.3.

        Requires `_initParams` to have filled `mus`/`sigmas` on the
        fluorescence nodes, and `self.perturbData` to be loaded.
        """
        hiddenNodes = [
            n for n in self.network
            if not self.network.node[n]['nodeObj'].bMeasured
        ]
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'
        ]
        nCases, _ = self.obsData.shape()
        caseNames = self.obsData.getRownames()

        def logDensity(values, mu, sigma):
            # Gaussian log density without the constant -0.5 * log(2 * pi)
            return -np.log(sigma) - 0.5 * np.square(values - mu) / np.square(
                sigma)

        self.nodeStates = list()
        for c in range(self.nChains):
            tmp = np.zeros((nCases, len(hiddenNodes)))
            tmp[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            tmp = np.column_stack((tmp, self.perturbData.data))
            colnames = hiddenNodes + self.perturbData.colnames
            self.nodeStates.append(
                NamedMatrix(npMatrix=tmp,
                            colnames=colnames,
                            rownames=caseNames))

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                phosProbOne = logDensity(fluoData, fluoNodeObj.mus[c, 1],
                                         fluoNodeObj.sigmas[c, 1])
                phosProbZero = logDensity(fluoData, fluoNodeObj.mus[c, 0],
                                          fluoNodeObj.sigmas[c, 0])
                tmp = np.zeros(nCases)
                tmp[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:, nodeIndx] = tmp

                # take care of missing values by random sampling
                if self.missingDataMatrix:
                    # The constructor renames the missing-data columns with
                    # the 'F' suffix, so they must be looked up by the
                    # fluorescence name; the bare antibody name never matches.
                    if fluoNode in self.missingDataMatrix.getColnames():
                        missingCases = self.missingDataMatrix.getValuesByCol(
                            fluoNode) == 1
                        tmp = np.zeros(int(np.sum(missingCases)))
                        tmp[np.random.rand(len(tmp)) <= 0.3] = 1
                        self.nodeStates[c].data[missingCases, nodeIndx] = tmp

    ## Calculate the marginal probability of observing the measured data by
    #  integrating out all possible setting of latent variable states and
    #  model parameters.
    def calcEvidenceLikelihood(self):
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'
        ]
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data)
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack(
                    (np.ones(nCases),
                     self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (
                    1 + np.exp(-np.dot(predStates, nodeObj.params[c, :])))
                pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(
                    np.float).eps

                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
                + (1 - nodeData) * np.log(1 - pOneCondOnParents))

            loglikelihood /= self.nChains
            return loglikelihood

    ## Perform graph search
    def trainGibbsEM(self,
                     nChains=10,
                     alpha=0.1,
                     nParents=4,
                     nSamples=5,
                     pickleDumpFile=None,
                     maxIter=1000):
        """Train the model with a Gibbs-sampling EM loop.

        Each iteration resamples the activation states (E-step).  Every
        second iteration the current state matrices are accumulated; once
        `nSamples` samples are collected their mean is used to refit the
        node parameters (M-step), the evidence log-likelihood is recorded in
        `self.likelihood`, and the object is pickled whenever that
        likelihood is the best seen so far.

        Args:
            nChains: Number of chains.
            alpha: Forwarded to `_updteParams` (glmnet `alpha`).
            nParents: Forwarded to `_updteParams` as the parent budget.
            nSamples: Samples accumulated before each M-step.
            pickleDumpFile: Destination of the best-model pickle; defaults
                to "<obsDataFileName>alpha<alpha>.pickle".
            maxIter: Maximum number of E-step iterations.

        Returns:
            self

        Raises:
            Exception: if the pickle file cannot be written.
        """
        self.nChains = nChains
        self.alpha = alpha
        self.nSamples = nSamples
        self.nParents = nParents
        self.likelihood = list()

        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(
                self.alpha) + ".pickle"

        # check if the network and data agree: a fluorescence node without a
        # data column is dropped together with its first predecessor
        nodeToDelete = list()
        for nodeId in self.network:
            isFluo = self.network.node[nodeId][
                'nodeObj'].type == 'FLUORESCENCE'
            if isFluo and nodeId not in self.obsData.getColnames():
                print("Node " + nodeId + " don't has associated data")
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print("removing node " + nodeId)
                self.network.remove_node(nodeId)

        # Starting EM: set up Markov chains to train a model purely based on
        # prior knowledge
        self._initParams()
        self._initHiddenStates()

        # each chain collects expected statistics of nodes from samples
        # along the chain
        self.expectedStates = [
            np.zeros(np.shape(self.nodeStates[c].data))
            for c in range(self.nChains)
        ]

        print("Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(
            self.nChains) + "; nSamples = " + str(
                self.nSamples) + "; nParents = " + str(self.nParents))
        optLikelihood = float("-inf")
        sampleCount = 0

        likelihood = self.calcEvidenceLikelihood()
        print("nIter: 0" + "; log likelihood of evidence: " + str(likelihood))
        self.likelihood.append(likelihood)
        for nIter in range(maxIter):

            # E-step of EM
            self._updateActivationStates()
            if (nIter + 1) % 2 == 0:  # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] += self.nodeStates[c].data

            # M-step of EM.  We only update parameters after collecting a
            # certain number of samples
            if sampleCount < self.nSamples:
                continue
            sampleCount = 0

            # take expectation of sample states; a real list is required
            # because the entries are indexed and reassigned below
            self.expectedStates = [
                states / self.nSamples for states in self.expectedStates
            ]
            self._updteParams(self.alpha, nparents=self.nParents)

            likelihood = self.calcEvidenceLikelihood()
            self.likelihood.append(likelihood)
            print("nIter: " + str(
                nIter +
                1) + "; log likelihood of evidence: " + str(likelihood))

            # collect the current best fit models
            if likelihood > optLikelihood:
                optLikelihood = likelihood
                try:
                    # 'with' guarantees the dump file is flushed and closed
                    with open(self.pickleDumpFile, 'wb') as dumpFile:
                        cPickle.dump(self, dumpFile)
                except Exception:
                    # not a bare except: Ctrl-C must still abort training
                    raise Exception("Cannot create pickle dumpfile " +
                                    self.pickleDumpFile)

            if self._checkConvergence():
                print("EM converged!")
                break

            for c in range(self.nChains):  # clear expectedStates
                self.expectedStates[c] = np.zeros(
                    np.shape(self.nodeStates[c].data))

        return self

    def _checkConvergence(self):
        # To do, add convergence checking code
        if len(self.likelihood) < 20:
            return False

        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml) / abs(ml)
        return ratio <= 0.001

    def _updateActivationStates(self):
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        # interate through all nodes.
        activationNode = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE'
        ]

        for nodeId in activationNode:
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)

                # sample states of current node based on the prob, and update
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState

                # clamp the activationState of perturbed nodes to a fix value
                if nodeId in self.dictPerturbEffect:
                    # the diction keeps a list conditins under which the node is perurbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(
                            condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState == 1,
                                                indx] = state

    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate the marginal probability of a node's state set to "1" conditioning 
        on all evidence.
        
        args:
             nodeId   A string id of the node of interest
             c        An integer indicate the chain from which the parameter 
                         vector to be used  
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception(
                "Call _caclNodeMarginalProb on an observed variable " + nodeId)

        nCases, nAntibody = np.shape(self.obsData.data)

        # collect the state of the predecessors of the node
        preds = self.network.predecessors(nodeId)
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:  # if the node has parents
            # calculate p(curNode = 1 | parents);
            nodeParams = nodeObj.params[c, :]
            predStates = np.column_stack(
                (np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
            pOneCondOnParents = 1 / (1 +
                                     np.exp(-np.dot(predStates, nodeParams)))
            pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(np.float).eps
            pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(np.float).eps
            logProbOneCondOnParents = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect  evidence from  children
        logProbChildCondOne = 0  # the prob of child conditioning on current node == 1
        logProdOfChildCondZeros = 0

        children = self.network.successors(nodeId)
        if len(children) > 0:
            for child in children:
                childNodeObj = self.network.node[child]['nodeObj']
                curChildStates = self.nodeStates[c].getValuesByCol(child)

                # Collect states of the predecessors of the child
                childPreds = self.network.predecessors(child)
                childNodeParams = childNodeObj.params[c, :]
                childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
                childPredStates = np.column_stack(
                    (np.ones(nCases), childPredStates
                     ))  # padding data with a column ones as bias

                # Set the state of current node to ones
                curNodePosInPredList = childPreds.index(
                    nodeId) + 1  # offset by 1 because padding
                if childNodeParams[
                        curNodePosInPredList] == 0:  # not an real edge
                    continue
                childPredStates[:, curNodePosInPredList] = np.ones(nCases)
                pChildCondCurNodeOnes = 1 / (
                    1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeOnes[pChildCondCurNodeOnes == 1] -= np.finfo(
                    np.float).eps
                pChildCondCurNodeOnes[pChildCondCurNodeOnes == 0] += np.finfo(
                    np.float).eps
                logProbChildCondOne += np.log(curChildStates *
                                              pChildCondCurNodeOnes +
                                              (1 - curChildStates) *
                                              (1 - pChildCondCurNodeOnes))

                # set the state of the current node (nodeId) to zeros
                childPredStates[:, curNodePosInPredList] = np.zeros(nCases)
                pChildCondCurNodeZeros = 1 / (
                    1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeZeros[pChildCondCurNodeZeros ==
                                       1] -= np.finfo(np.float).eps
                pChildCondCurNodeZeros[pChildCondCurNodeZeros ==
                                       0] += np.finfo(np.float).eps
                logProdOfChildCondZeros += np.log(curChildStates *
                                                  pChildCondCurNodeZeros +
                                                  (1 - curChildStates) *
                                                  (1 - pChildCondCurNodeZeros))

        # now we can calculate the marginal probability of current node
        curNodeMarginal = 1 / (
            1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros -
                       logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal

    def parseGlmnetCoef(self, glmnet_res):
        """ Parse the 'beta' matrix returned by calling glmnet through RPy2.
            Return the first column of 'beta' matrix of the glmnet object 
            with 3 or more non-zero values 
            """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # Read in lines of beta matrix txt, which is a nVariables * nLambda.
        # Since we call glmnet by padding x with a column of 1s, we only work
        # with the 'beta' matrix returned by fit
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        dimStr = re.search("\d+\s+x\s+\d+", betaLines[1]).group(0)
        if not dimStr:
            raise Exception(
                "'parse_glmnet_res' could not determine the dims of beta")
        nVariables, nLambda = map(int, dimStr.split(' x '))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=np.float)

        # glmnet print beta matrix in mulitple blocks with
        # nVariable * blockSize
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = -blockSize
        for line in betaLines:  #read in blocks
            m = re.search('^V\d+', line)
            if not m:  # only find the lines begins with 'V\d'
                continue
            else:
                rowIndx = int(m.group(0)[1:len(m.group(0))])
            if rowIndx == 1:
                curBlockColStart += blockSize

            # set 'rowIndx' as start from 0
            rowIndx -= 1

            fields = line.rstrip().split()
            fields.pop(0)
            if len(fields) != blockSize:
                blockSize = len(fields)
            for j in range(blockSize):
                if fields[j] == '.':
                    continue
                else:
                    betaMatrix[rowIndx,
                               curBlockColStart + j] = float(fields[j])

        return a0, betaMatrix

    def _updteParams(self, alpha=0.1, nparents=None):
        """Refit p(node | parents) for every hidden node (M-step).

        For each node that is neither FLUORESCENCE nor PERTURBATION and has
        predecessors, and for each chain, a binomial glmnet model is fitted
        with the expected predecessor states (plus a leading column of
        ones) as x and the node's current states as y.  The first column of
        the coefficient path with at least `nparents` non-zero entries is
        kept, pruned to its largest-magnitude entries, and stored in
        `nodeObj.params[c, :]`.  The raw fits are kept in `nodeObj.fitRes`.

        Args:
            alpha: Passed to glmnet as `alpha`.
            nparents: Number of non-zero coefficients to aim for; falls back
                to `self.nParents` when falsy.
        """
        nCases = np.shape(self.obsData.data)[0]
        if not nparents:
            nparents = self.nParents

        def randomRows(count):
            # `count` random case indices as a plain list, usable for
            # numpy fancy indexing
            return [int(math.floor(z)) for z in np.random.rand(count) * nCases]

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            if len(preds) == 0:
                # _initNodeParams leaves params as None for such nodes, so
                # there is nothing to fit or to store into
                continue
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                # glmnet has problems when all y (or a whole x column) share
                # one value, so a few random entries are flipped
                # NOTE(review): if getValuesByCol returns a view, flipping y
                # also alters the chain's state matrix -- confirm intent.
                rIndx = randomRows(50)
                if np.sum(y) == nCases:  # if every y == 1
                    y[rIndx] = 0
                elif np.sum(1 - y) == nCases:  # if every y == 0
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                # NOTE(review): the bias column (all ones) matches this test
                # too and therefore also receives random zeros -- confirm.
                for col in np.where(np.sum(x, 0) == nCases)[0]:
                    x[randomRows(3), col] = 0
                for col in np.where(np.sum(1 - x, 0) == nCases)[0]:
                    x[randomRows(3), col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet(x, y, alpha=alpha, family="binomial", intercept=0)
                nodeObj.fitRes.append(fit)

                # extract coefficients of glmnet, keep the first set of beta
                # with nparents non-zero values
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if np.sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if np.sum(myparams != 0.) > nparents:
                    # prune with the same budget that selected the column
                    # (the `nparents` argument, not self.nParents)
                    sortedParams = sorted(np.abs(myparams))
                    myparams[np.abs(myparams) < sortedParams[-nparents]] = 0.

                nodeObj.params[c, :] = myparams

    def getStimuliSpecificNet(self, stimulus):
        """Build an antibody-level network of edges relevant to `stimulus`.

        Steps:
          1. Fluorescence nodes whose values differ between the stimulus
             cases and the control cases (R t.test, p < 0.05) mark their
             first predecessor as active.  Control cases are those where
             none of the columns in `self.stimuli` is set.
          2. For every protein -> phosphorylation edge (u, v) whose
             coefficient is zero in at most 90% of the chains, an edge
             ab -> v is added for each antibody `ab` of u, labelled "+" or
             "-" by the sign of the coefficient summed over chains.
          3. Inactive nodes without successors or without predecessors are
             removed repeatedly until none is left.

        Side effect: sets `self.stimuli` to the built-in stimulus list.

        Args:
            stimulus: Column name of the stimulus in the state matrices.

        Returns:
            networkx.DiGraph with edge attributes `effect` and `betaValue`.

        Raises:
            Exception: if `stimulus` is not a state-matrix column, or if a
                node's parameter vector does not match its predecessors.
        """
        self.stimuli = [
            'EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum'
        ]
        if stimulus not in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus +
                            "' is not in the experiment data")

        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli),
                              1) == 0

        # identify the nodes to keep by determining whether a node responds
        # to the stimulus
        activeNodes = set([stimulus])
        for nodeId in self.network:
            nodeType = self.network.node[nodeId]['nodeObj'].type
            if nodeType not in ('FLUORESCENCE', 'fluorescence'):
                continue
            nodeValues = self.obsData.getValuesByCol(nodeId)
            ttestRes = R('t.test')(
                robjects.FloatVector(nodeValues[controlCases]),
                robjects.FloatVector(nodeValues[stimulusCases]))
            pvalue = np.array(ttestRes.rx('p.value')[0])[0]
            if pvalue < 0.05:
                activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy network to a tmp, redirecting edges from activation state
        # nodes; an edge indicates the impact
        activationTypes = ('ACTIVATIONSTATE', 'activeState')
        phosTypes = ('PHOSPHORYLATIONSTATE', 'phosState')
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            uObj = self.network.node[u]['nodeObj']
            vObj = self.network.node[v]['nodeObj']
            # only edges pointing from a protein to an antibody matter
            if uObj.type not in activationTypes or vObj.type not in phosTypes:
                continue

            # extract parameters associated with u and v
            vPreds = self.network.predecessors(v)
            uIndx = vPreds.index(u)
            vParams = np.sum(vObj.params, 0)
            if len(vParams) != (len(vPreds) + 1):
                raise Exception("Bug in retrieving parameters of node v " + v)
            paramZeros = np.sum(vObj.params == 0, 0)
            # builtin float: the np.float alias no longer exists in recent
            # NumPy
            if float(paramZeros[uIndx + 1]) / float(self.nChains) > .9:
                continue  # don't add edge with beta == 0

            betaValue = vParams[uIndx + 1]  # +1 skips the bias coefficient
            for ab in self.dictProteinToAntibody[u]:
                if ab not in self.network:
                    continue
                # sanity check on u's own parameter vector; the sign of the
                # phosphorylation -> activation coefficient is not used
                uPreds = self.network.predecessors(u)
                uParams = np.mean(uObj.params, 0)
                if len(uParams) != (len(uPreds) + 1):
                    raise Exception(
                        "Bug in retrieving parameters of node u " + u)

                if betaValue > 0.:
                    tmpNet.add_edge(ab, v, effect="+", betaValue=betaValue)
                elif betaValue < 0.:
                    tmpNet.add_edge(ab, v, effect="-", betaValue=betaValue)

        # remove leaf nodes that are not in the activeNodes set; removal can
        # expose new leaves, hence the loop
        while True:
            leafNodes = [
                nodeId for nodeId in tmpNet
                if nodeId not in activeNodes and
                (len(tmpNet.successors(nodeId)) == 0 or
                 len(tmpNet.predecessors(nodeId)) == 0)
            ]
            if not leafNodes:
                break
            tmpNet.remove_nodes_from(leafNodes)

        return tmpNet

    def toGraphML(self, filename):
        """Write the edge structure of the network to `filename` as GraphML.

        Only the edges are copied into a fresh DiGraph, so node attributes
        such as 'nodeObj' are not written; nodes without any edge are not
        written either.

        Args:
            filename: Destination passed to networkx.write_graphml.
        """
        tmpNet = nx.DiGraph()
        # add_edge() needs the two endpoints as separate arguments; handing
        # it one (u, v) tuple raises TypeError, so copy the edges in bulk.
        tmpNet.add_edges_from(self.network.edges())

        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)

Example #2

Show file

File: PyGibbCAMP.py Project: akshayms/PyGibbCAMP

class PyGibbCAMP:  
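    ## Constructor
    #  @param nodeFile  Name of the text file listing the nodes, one
    #                   "protein,antibody" pair per line
    #  @param dataMatrixFile  Name of the file holding the observed data
    #                   matrix; it is loaded through NamedMatrix
    #  @param perturbMatrix  Optional perturbation matrix, loaded through
    #                   NamedMatrix
    #  @param missingDataMatrix  Optional missing-value indicator matrix,
    #                   loaded through NamedMatrix
    #  @note  There is no edgeFile parameter: the edges are derived from the
    #         node list inside the constructor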
    def __init__(self, nodeFile, dataMatrixFile, perturbMatrix=None, missingDataMatrix=None):
        """Load the data matrices and build the initial network.

        For every "protein,antibody" line of the node file three nodes are
        created (protein activation state, antibody phosphorylation state and
        the antibody fluorescence named antibody + 'F'), then every protein
        is connected to the antibodies of all other proteins and every
        perturbation is connected to every protein.

        args:
             nodeFile           Name of a text file with one
                                "protein,antibody" pair per line
             dataMatrixFile     Handed to NamedMatrix to load the observed
                                data; also stored as obsDataFileName
             perturbMatrix      Optional; handed to NamedMatrix to load the
                                perturbation data
             missingDataMatrix  Optional; handed to NamedMatrix to load the
                                missing-value indicators

        raises:
             Exception  if dataMatrixFile or nodeFile is empty, if the node
                        file cannot be opened, or if a column of the missing
                        data matrix sums to the number of cases
        """
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        self.perturbData = None
        # Without a perturbation matrix there are no perturbation nodes; an
        # empty list keeps the loops below valid instead of iterating None.
        perturbInstances = []
        self.perturbInstances = perturbInstances
        self.nChains = 1

        # node -> list of (perturbation condition, state the node is clamped to)
        self.dictPerturbEffect = {
            'AKT1': [('GSK690693', 0), ('GSK690693_GSK1120212', 0)],
            'MAP2K1': [('GSK690693_GSK1120212', 0)],
            'EGFR': [('EGF', 1), ('FGF1', 1)],
        }

        # parse data matrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception("Cannot create PyCAMP obj without 'dataMatrixFile'")
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        # observed columns are renamed to the fluorescence node names
        self.obsData.colnames = [s + 'F' for s in self.obsData.colnames]
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            perturbInstances = self.perturbData.getColnames()
            self.perturbInstances = perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            # column sums are taken over the numeric array held by the
            # NamedMatrix, not over the wrapper object itself
            allMissing = np.sum(self.missingDataMatrix.data, 0) == nCases
            if np.any(allMissing):
                raise Exception ("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = [s + 'F' for s in self.missingDataMatrix.colnames]

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")

        try:
            # 'with' closes the file even when reading fails
            with open(nodeFile, "r") as nf:
                nodeLines = nf.readlines()
        except IOError:
            raise Exception("Failed to open the file containing nodes")
        if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
            nodeLines = nodeLines[0].split("\r")

        print("Creating network")
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            line = line.rstrip()
            if not line:
                # blank lines (e.g. the empty tail left by the '\r' split)
                # carry no node and would break the unpacking below
                continue
            protein, antibody = line.split(',')

            if protein not in self.dictProteinToAntibody:
                self.dictProteinToAntibody[protein] = []
            self.dictProteinToAntibody[protein].append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody, nodeObj = SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
            self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))

        # Add edges between PERTURBATION, protein activity, and phosphorylation layers
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                # a protein is not linked to its own antibodies
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)
            
        
    ## Init parameters of the model
    #  In a Bayesian network setting, the joint probability is calculated
    #  as the product of a series of conditional probabilities.  The
    #  parameters of the PyCAMP model define p(X | Pa(X)).  For an observed
    #  fluorescence node the conditional probability is a mixture of two
    #  Gaussian distributions; therefore, the parameters are two pairs of mu
    #  and sigma.  For the hidden variables representing phosphorylation
    #  states and activation states of proteins, the conditional probability
    #  is defined by a logistic regression.  Therefore, the parameters
    #  associated with such a node are a vector of real numbers.
    #
    def _initParams(self):
        print "Initialize parameters associated with each node in each MCMC chain"
        for nodeId in self.network: 
            self._initNodeParams(nodeId)
            
    def _initNodeParams(self, nodeId):
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':                
            # Estimate mean and sd of fluo signal using mixture model
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames():
                nodeData = self.obsData.getValuesByCol( nodeId)
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(nodeId) == 0]
            else:
                nodeData = self.obsData.getValuesByCol(nodeId)
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):   
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k = 2 )
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c,:] = np.array(mixGaussians[2])
                nodeObj.sigmas[c,:] = np.array(mixGaussians[3])            
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params  = None
                
    
    ## Initialize latent variables
    #    
    #
    def _initHiddenStates(self):
        """Create the state matrix of every chain and seed the hidden states.

        For each chain a NamedMatrix is appended to self.nodeStates whose
        columns are the unmeasured nodes followed by the perturbation
        columns.  Hidden states start at 1 with probability 0.3.  The state
        of each PHOSPHORYLATIONSTATE node is then set from its fluorescence
        column by comparing the Gaussian log densities of the two mixture
        components stored on the fluorescence node; cases flagged as missing
        are redrawn at random instead.
        """
        hiddenNodes = [n for n in self.network if not self.network.node[n]['nodeObj'].bMeasured]
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        nCases, nAntibody = self.obsData.shape()
        caseNames = self.obsData.getRownames()

        self.nodeStates = list()
        for c in range(self.nChains):
            initStates = np.zeros((nCases, len(hiddenNodes)))
            initStates[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            initStates = np.column_stack((initStates, self.perturbData.data))
            colnames = hiddenNodes + self.perturbData.colnames
            self.nodeStates.append(NamedMatrix(npMatrix = initStates, colnames = colnames, rownames = caseNames))

            # initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)

                # Gaussian log densities (up to a shared constant) of the
                # observation under mixture component 1 and component 0
                phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])
                phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])

                phosState = np.zeros(nCases)
                phosState[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:, nodeIndx] = phosState

                # take care of missing values by random sampling.  The
                # columns of the missing-data matrix carry the fluorescence
                # names (antibody + 'F', set in the constructor), so the
                # lookup has to use fluoNode rather than the phos node name.
                if self.missingDataMatrix and fluoNode in self.missingDataMatrix.getColnames():
                    missingCases = self.missingDataMatrix.getValuesByCol(fluoNode) == 1
                    randomState = np.zeros(int(np.sum(missingCases)))
                    randomState[np.random.rand(len(randomState)) <= 0.3] = 1
                    self.nodeStates[c].data[missingCases, nodeIndx] = randomState
                    
        
        
    ## Calculate the marginal probability of observing the measured data by
    #  integrating out all possible setting of latent variable states and 
    #  model parameters.            
    def calcEvidenceLikelihood(self):
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data) 
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeObj.params[c,:])))
                pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(np.float).eps
                
                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
                + (1 - nodeData) * np.log(1 - pOneCondOnParents))
                
            loglikelihood /= self.nChains
            return loglikelihood
        
    ## Perform graph search
    def trainGibbsEM(self, nChains = 10, alpha = 0.1, nParents = 4, nSamples = 5, pickleDumpFile = None, maxIter = 1000):
        """Train the model by alternating state sampling and parameter updates.

        Each iteration resamples the activation states; every other
        iteration the current states of all chains are added to the expected
        states.  Once nSamples samples have been collected the expected
        states are averaged, the parameters are updated, the likelihood is
        recorded, the object is pickled when the likelihood improved, and
        convergence is checked.

        args:
             nChains         Number of chains, stored as self.nChains
             alpha           Passed on to _updteParams, stored as self.alpha
             nParents        Passed on to _updteParams as nparents
             nSamples        Number of samples collected before each
                             parameter update
             pickleDumpFile  File receiving the pickled object; defaults to
                             obsDataFileName + "alpha" + str(alpha) + ".pickle"
             maxIter         Maximum number of iterations

        returns:
             self

        raises:
             Exception  if the pickle dump file cannot be written
        """
        self.nChains = nChains
        self.alpha = alpha
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents

        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(self.alpha) +  ".pickle"

        # check if the network and data agrees: a fluorescence node without a
        # data column is dropped together with its first predecessor
        nodeToDelete = list()
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' and nodeId not in self.obsData.getColnames():
                print("Node " + nodeId + " don't has associated data")
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print("removing node " + nodeId)
                self.network.remove_node(nodeId)

        # Starting EM set up Markov chains  to train a model purely based on prior knowledge
        self._initParams()
        self._initHiddenStates()

        # perform update of latent variables in a layer-wise manner
        self.likelihood = list()

        self.expectedStates = list()
        nCases, nAntibodies = np.shape(self.obsData.data)
        for c in range(self.nChains):
            # each chain collect expected statistics of nodes from samples along the chain
            self.expectedStates.append(np.zeros(np.shape(self.nodeStates[c].data)))

        print("Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(self.nChains) + "; nSamples = " + str (self.nSamples) + "; nParents = " + str(self.nParents))
        optLikelihood = float("-inf")
        bConverged = False
        sampleCount = 0

        likelihood = self.calcEvidenceLikelihood()
        print("nIter: 0"  + "; log likelihood of evidence: " + str(likelihood))
        self.likelihood.append(likelihood)
        for nIter in range(maxIter):

            # E-step of EM
            self._updateActivationStates()
            if  (nIter+1) % 2 == 0: # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] +=  self.nodeStates[c].data

            # M-step of EM.  We only update parameters after a collecting a certain number of samples
            if sampleCount >= self.nSamples:
                sampleCount = 0
                # take expectation of sample states
                self.expectedStates = [x / self.nSamples for x in self.expectedStates]
                self._updteParams(self.alpha, nparents = self.nParents)

                likelihood = self.calcEvidenceLikelihood()
                self.likelihood.append(likelihood)
                print("nIter: " + str(nIter + 1) + "; log likelihood of evidence: " + str(likelihood))

                # collect the current best fit models
                if likelihood > optLikelihood:
                    optLikelihood = likelihood
                    try:
                        # 'with' closes the dump file even when pickling fails
                        with open(self.pickleDumpFile, 'wb') as dumpFile:
                            cPickle.dump(self, dumpFile)
                    except Exception:
                        # 'except Exception' leaves KeyboardInterrupt and
                        # SystemExit untouched
                        raise Exception("Cannot create pickle dumpfile " + self.pickleDumpFile)

                bConverged = self._checkConvergence()
                if bConverged:
                    print("EM converged!")
                    break

                for c in range(self.nChains):  # clear expectedStates
                    self.expectedStates[c] = np.zeros(np.shape(self.nodeStates[c].data))

        # now try to delete edges that does contribute to evidence
        #self.trimEdgeByConsensus(.9)
        return self
            
    def _checkConvergence(self):
        # To do, add convergence checking code
        if len(self.likelihood) < 20:
            return False
            
        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml ) / abs(ml)        
        return ratio <= 0.001

    def _updateActivationStates(self):
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        # interate through all nodes. 
        activationNode = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']
                    
        for nodeId in activationNode: 
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)
                
                # sample states of current node based on the prob, and update 
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState
                
                # clamp the activationState of perturbed nodes to a fix value
                if nodeId in self.dictPerturbEffect:
                    # the diction keeps a list conditins under which the node is perurbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState==1, indx] = state
                        
            
    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate the marginal probability of a node's state set to "1" conditioning 
        on all evidence.
        
        args:
             nodeId   A string id of the node of interest
             c        An integer indicate the chain from which the parameter 
                         vector to be used  
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception("Call _caclNodeMarginalProb on an observed variable " + nodeId)

        nCases, nAntibody = np.shape(self.obsData.data)        

        # collect the state of the predecessors of the node
        preds = self.network.predecessors(nodeId)        
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:  # if the node has parents  
            # calculate p(curNode = 1 | parents);                 
            nodeParams = nodeObj.params[c,:] 
            predStates =  np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds))) 
            pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeParams)))
            pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(np.float).eps
            pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(np.float).eps
            logProbOneCondOnParents  = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect  evidence from  children 
        logProbChildCondOne = 0  # the prob of child conditioning on current node == 1
        logProdOfChildCondZeros = 0
        
        children = self.network.successors(nodeId)
        if len(children) > 0:
            for child in children:  
                childNodeObj = self.network.node[child]['nodeObj']
                curChildStates = self.nodeStates[c].getValuesByCol(child)                    
                
                # Collect states of the predecessors of the child
                childPreds = self.network.predecessors(child)
                childNodeParams = childNodeObj.params[c,:]
                childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
                childPredStates = np.column_stack((np.ones(nCases), childPredStates)) # padding data with a column ones as bias

                # Set the state of current node to ones 
                curNodePosInPredList = childPreds.index(nodeId) + 1 # offset by 1 because padding 
                if childNodeParams[curNodePosInPredList] == 0:  # not an real edge 
                    continue
                childPredStates[:, curNodePosInPredList] = np.ones(nCases)                
                pChildCondCurNodeOnes = 1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeOnes[pChildCondCurNodeOnes==1] -= np.finfo(np.float).eps
                pChildCondCurNodeOnes[pChildCondCurNodeOnes==0] += np.finfo(np.float).eps
                logProbChildCondOne += np.log (curChildStates * pChildCondCurNodeOnes + (1 - curChildStates) * (1 - pChildCondCurNodeOnes))
                    
                # set the state of the current node (nodeId) to zeros 
                childPredStates [:, curNodePosInPredList] = np.zeros(nCases)
                pChildCondCurNodeZeros = 1 / (1 + np.exp(- np.dot(childPredStates, childNodeParams))) 
                pChildCondCurNodeZeros[pChildCondCurNodeZeros==1]  -= np.finfo(np.float).eps
                pChildCondCurNodeZeros[pChildCondCurNodeZeros==0]  += np.finfo(np.float).eps
                logProdOfChildCondZeros += np.log(curChildStates * pChildCondCurNodeZeros + (1 - curChildStates) * (1 - pChildCondCurNodeZeros))

        # now we can calculate the marginal probability of current node 
        curNodeMarginal = 1 / (1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros - logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal
    

    def parseGlmnetCoef(self, glmnet_res):
        """ Parse the intercepts and the 'beta' matrix of a glmnet result
            obtained through RPy2.

            The 'beta' element is turned into its printed text form and the
            rows labelled V1, V2, ... are read back block by block; entries
            printed as '.' are left at zero.

            args:
                 glmnet_res   Object offering rx('a0') and rx('beta')

            returns:
                 (a0, betaMatrix): a0 is the first element of rx('a0') as a
                 numpy array; betaMatrix is a nVariables x nLambda array

            raises:
                 Exception  if the dimensions of 'beta' cannot be found on
                            the second line of the printed text
            """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # Read in lines of beta matrix txt, which is a nVariables * nLambda.
        # Since we call glmnet by padding x with a column of 1s, we only work
        # with the 'beta' matrix returned by fit
        betaLines = str(glmnet_res.rx('beta')).splitlines()
        # test the match object itself: calling .group() on a failed search
        # would raise AttributeError before the intended error below
        dimMatch = re.search(r"(\d+)\s+x\s+(\d+)", betaLines[1])
        if not dimMatch:
            raise Exception("'parse_glmnet_res' could not determine the dims of beta")
        nVariables = int(dimMatch.group(1))
        nLambda = int(dimMatch.group(2))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=float)

        # glmnet print beta matrix in multiple blocks with
        # nVariable * blockSize
        rowPattern = re.compile(r'^V(\d+)')
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = - blockSize
        for line in betaLines:  # read in blocks
            m = rowPattern.search(line)
            if not m:  # only the lines beginning with 'V<number>' hold data
                continue
            rowIndx = int(m.group(1))
            if rowIndx == 1:
                # a new block starts: shift by the width of the previous one
                curBlockColStart += blockSize

            fields = line.rstrip().split()[1:]  # drop the row label
            blockSize = len(fields)  # the last block may be narrower
            for j, field in enumerate(fields):
                if field != '.':
                    # row labels start at 1, matrix rows at 0
                    betaMatrix[rowIndx - 1, curBlockColStart + j] = float(field)

        return a0, betaMatrix
      
        
    def _updteParams(self, alpha = 0.1, nparents=None):
        """Re-fit the parameters of every node that is neither FLUORESCENCE
        nor PERTURBATION.

        For each such node and chain, glmnet is called with the expected
        states of the node's predecessors (padded with a leading column of
        ones) as x and the node's current states as y.  The first column of
        the returned beta matrix with at least nparents non-zero entries is
        kept, truncated to its largest coefficients, and stored as the
        chain's row of the node's params.

        args:
             alpha      Passed to glmnet as its alpha argument
             nparents   Number of non-zero coefficients aimed for; falls back
                        to self.nParents when not given
        """
        nCases, nVariables = np.shape(self.obsData.data)
        if not nparents:
            nparents = self.nParents

        def randomRows(count):
            # 'count' row indices drawn uniformly, with replacement
            return [int(math.floor(z)) for z in np.random.rand(count) * nCases]

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            if len(preds) == 0:
                # _initNodeParams leaves params as None for such a node, so
                # there is nothing to regress on and nowhere to store a fit
                continue
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                # check if all x and y are of same value, which will lead to
                # problem for glmnet; if so, flip a few random entries
                rIndx = randomRows(50)
                if np.sum(y) == nCases:  # if every y == 1
                    y[rIndx] = 0
                elif np.sum(1 - y) == nCases:  # if every y == 0
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                allOnesCols = np.where(np.sum(x, 0) == nCases)[0]
                for col in allOnesCols:
                    x[randomRows(3), col] = 0
                allZerosCols = np.where(np.sum(1 - x, 0) == nCases)[0]
                for col in allZerosCols:
                    x[randomRows(3), col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet (x, y, alpha = alpha, family = "binomial", intercept = 0)
                nodeObj.fitRes.append(fit)

                # extract coefficients glmnet, keep the first set beta with nparents non-zeros values
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if sum( myparams != 0.) > nparents:
                    sortedParams = sorted(np.abs(myparams))
                    # zero everything below the nparents-th largest magnitude;
                    # the cut-off uses the same nparents as the tests above
                    myparams[np.abs(myparams) < sortedParams[-nparents]] = 0.

                nodeObj.params[c,:] =  myparams
                        
                        
    def getStimuliSpecificNet(self, stimulus):
        """Build a graph of signed antibody-to-antibody edges for one stimulus.

        Fluorescence nodes whose values differ between the stimulus cases
        and the control cases (t.test p-value below 0.05) mark their first
        predecessor as active.  Every edge from an activation-state node u
        to a phosphorylation-state node v whose coefficient is non-zero in
        enough chains is redirected so that it starts from each antibody of
        u, labelled "+" or "-" after the sign of the summed coefficient.
        Inactive nodes lacking successors or predecessors are then pruned
        repeatedly.

        args:
             stimulus   Name of a column of the node state matrix

        returns:
             A networkx DiGraph whose edges carry 'effect' and 'betaValue'

        raises:
             Exception  if the stimulus is not a column of the node states,
                        or if a parameter vector does not match the number
                        of predecessors of its node
        """
        self.stimuli = ['EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum']
        if stimulus not in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus + "' is not in the experiment data")

        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        # control cases: none of the listed stimuli is switched on
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli), 1) == 0

        # identify the nodes to keep by determine if a node responds to a stimuli
        activeNodes = set()
        activeNodes.add(stimulus)
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type in ('FLUORESCENCE', 'fluorescence'):
                nodeValues = self.obsData.getValuesByCol(nodeId)
                ttestRes = R('t.test')(robjects.FloatVector(nodeValues[controlCases]), robjects.FloatVector(nodeValues[stimulusCases]))
                pvalue = np.array(ttestRes.rx('p.value')[0])[0]
                if pvalue < 0.05:
                    activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy network to a tmp, redirect edges from activation state nodes
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            # we are only interested in the edge from protein point to antibody
            uType = self.network.node[u]['nodeObj'].type
            vType = self.network.node[v]['nodeObj'].type
            if uType not in ('ACTIVATIONSTATE', 'activeState') \
            or vType not in ('PHOSPHORYLATIONSTATE', 'phosState'):
                continue

            # extract parameters associated with u and v
            vNodeParams = self.network.node[v]['nodeObj'].params
            vPreds = self.network.predecessors(v)
            uIndx = vPreds.index(u)
            vParams = np.sum(vNodeParams, 0)
            if len(vParams) != (len(vPreds) + 1):
                raise Exception("Bug in retrieving parameters of node v " + v)
            paramZeros = np.sum(vNodeParams == 0, 0)
            if float(paramZeros[uIndx + 1]) / float(self.nChains) > .9:
                continue  # coefficient is zero in more than 90% of the chains

            betaValue = vParams[uIndx + 1]  # +1 skips the leading column
            for ab in self.dictProteinToAntibody[u]:
                if ab not in self.network:
                    continue
                # sanity check on the parameters of the activation node
                uPreds = self.network.predecessors(u)
                uParams = np.mean(self.network.node[u]['nodeObj'].params, 0)
                if len(uParams) != (len(uPreds) + 1):
                    raise Exception("Bug in retrieving parameters of node u " + u)

                if betaValue > 0.:
                    tmpNet.add_edge(ab, v, effect = "+", betaValue = betaValue)
                elif betaValue < 0.:
                    tmpNet.add_edge(ab, v, effect = "-", betaValue = betaValue)

        # remove leaf nodes that are not in the activeNodes list; removing
        # one round of leaves can expose new ones, hence the outer loop
        while True:
            leafNodes = []
            for nodeId in tmpNet:
                if nodeId in activeNodes:
                    continue
                if len(tmpNet.successors(nodeId)) == 0 or len(tmpNet.predecessors(nodeId)) == 0:
                    leafNodes.append(nodeId)

            if len(leafNodes) == 0:
                break

            for leaf in leafNodes:
                tmpNet.remove_node(leaf)

        # NOTE: cycles are not removed here; the result need not be a DAG
        return tmpNet
            
                         
                        
    def toGraphML(self, filename):
        """Write the edge structure of the network to a GraphML file.

        Only the edges of ``self.network`` are copied into a fresh graph;
        node attributes of ``self.network`` are not carried over.

        args:
             filename   Name of the GraphML file to write
        """
        tmpNet = nx.DiGraph()
        # edges() yields (source, sink) tuples, while add_edge() needs the two
        # endpoints as separate arguments; add_edges_from() takes the tuples.
        tmpNet.add_edges_from(self.network.edges())

        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)