Python NamedMatrix.getValuesByCol 예제들

프로그래밍 언어: Python

네임스페이스/패키지 이름: NamedMatrix

클래스/타입: NamedMatrix

메소드/함수: getValuesByCol

hotexamples.com에서의 예제들: 2

Python NamedMatrix.getValuesByCol - 2개의 예제가 발견되었습니다. 이것들은 오픈소스 프로젝트에서 추출된 Python의 NamedMatrix.NamedMatrix.getValuesByCol에 대한 실세계 최고 등급의 예제들입니다. 예제들을 평가하여 예제의 품질 향상에 도움을 줄 수 있습니다.

자주 사용되는 메소드들

보기 숨기기

writeToText(3)

NamedMatrix(2)

getColnames(1)

getRownames(1)

getValuesByCol(1)

shape(1)

예제 #1

파일 보기

파일: PyGibbCAMP.py 프로젝트: xlu29466/PyGibbCAMP

class PyGibbCAMP:
    ## Constructor
    #  @param nodeFile  A string of pathname of file containing nodes.  The
    #                   name, type, measured
    #  @param edgeFile  A list of tuples, each containing a source and sink node
    #                   of an edge
    #  @param dataMatrixFile  A string to data
    def __init__(self,
                 nodeFile,
                 dataMatrixFile,
                 perturbMatrix=None,
                 missingDataMatrix=None):
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        perturbInstances = None
        self.nChains = 1

        self.dictPerturbEffect = {'AKT1' : [('GSK690693', 0), \
        ('GSK690693_GSK1120212', 0)], 'MAP2K1' : [('GSK690693_GSK1120212', 0)],\
        'EGFR': [('EGF' , 1), ('FGF1', 1)]}
        #        self.stimuli = ['EGF',	'FGF1',	'HGF',	'IGF1', 'Insulin',	'NRG1',	'PBS',	'Serum']

        # parse data mastrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception(
                "Cannot create PyCAMP obj without 'dataMatrixFile'")
            return
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        self.obsData.colnames = map(lambda s: s + 'F', self.obsData.colnames)
        self.obsDataFileName = dataMatrixFile

        if perturbMatrix:
            self.perturbData = NamedMatrix(perturbMatrix)
            perturbInstances = self.perturbData.getColnames()
            self.perturbInstances = perturbInstances

        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            allMissing = np.sum(self.missingDataMatrix, 0) == nCases
            if np.any(allMissing):
                raise Exception("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = map(
                lambda s: s + 'F', self.missingDataMatrix.colnames)

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")
            return

        try:
            nf = open(nodeFile, "r")
            nodeLines = nf.readlines()
            if len(nodeLines
                   ) == 1:  # Mac files end a line with \r instead of \n
                nodeLines = nodeLines[0].split("\r")
            nf.close()
        except IOError:
            raise Exception("Failed to open the file containing nodes")
            return

        print "Creating network"
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            #print line
            protein, antibody = line.rstrip().split(',')

            if protein not in self.dictProteinToAntibody:
                self.dictProteinToAntibody[protein] = []
            self.dictProteinToAntibody[protein].append(antibody)
            self.dictAntibodyToProtein[antibody] = protein

            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein,
                                      nodeObj=SigNetNode(
                                          protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody,
                                  nodeObj=SigNetNode(antibody,
                                                     'PHOSPHORYLATIONSTATE',
                                                     False))
            self.network.add_node(fluo,
                                  nodeObj=SigNetNode(fluo, 'FLUORESCENCE',
                                                     True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)

        for perturb in perturbInstances:
            self.network.add_node(perturb,
                                  nodeObj=SigNetNode(perturb, 'PERTURBATION',
                                                     True))

        # Add edges between PERTURBATION, protein activity,and  phosphorylation layers
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)

    ## Init parameters of the model
    #  In Bayesian network setting, the joint probability is calculated
    #  through the product of a series conditional probability.  The parameters
    #  of the PyCAMP model defines p(x | Pa(X)).  For observed fluorescent node
    #  the conditional probability is a mixture of two Gaussian distribution.
    #  therefore, the parameters are two pairs of mu and sigma.  For
    #  the hidden variables representing phosphorylation states and activation
    #  states of proteins, the conditional probability is defined by a logistic
    #  regression. Therefore, the parameters associated with such a node is a
    #  vector of real numbers.
    #
    def _initParams(self):
        print "Initialize parameters associated with each node in each MCMC chain"
        for nodeId in self.network:
            self._initNodeParams(nodeId)

    def _initNodeParams(self, nodeId):
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':
            # Estimate mean and sd of fluo signal using mixture model
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames(
            ):
                nodeData = self.obsData.getValuesByCol(nodeId)
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(
                    nodeId) == 0]
            else:
                nodeData = self.obsData.getValuesByCol(nodeId)
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k=2)
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c, :] = np.array(mixGaussians[2])
                nodeObj.sigmas[c, :] = np.array(mixGaussians[3])
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params = None

    ## Initialize latent variables
    #
    #
    def _initHiddenStates(self):
        hiddenNodes = [
            n for n in self.network
            if not self.network.node[n]['nodeObj'].bMeasured
        ]
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'
        ]
        #print str(phosNodes)
        nCases, nAntibody = self.obsData.shape()
        caseNames = self.obsData.getRownames()

        self.nodeStates = list()
        for c in range(self.nChains):
            tmp = np.zeros((nCases, len(hiddenNodes)))
            tmp[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            tmp = np.column_stack((tmp, self.perturbData.data))
            colnames = hiddenNodes + self.perturbData.colnames
            self.nodeStates.append(
                NamedMatrix(npMatrix=tmp,
                            colnames=colnames,
                            rownames=caseNames))

            #initialize phos state based on the observed fluo
            for node in phosNodes:
                fluoNode = node + 'F'
                #print "phosNode:" + node + "; fluoNode: " + fluoNode
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                tmp = np.zeros(nCases)
                phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])
                phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])
                tmp[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:, nodeIndx] = tmp

                # take care of missing values by random sampling
                if self.missingDataMatrix:
                    if node in self.missingDataMatrix.getColnames():
                        #print "processing node with missing values: " + nodeId
                        missingCases = self.missingDataMatrix.getValuesByCol(
                            node) == 1
                        tmp = np.zeros(sum(missingCases))
                        tmp[np.random.rand(len(tmp)) <= 0.3] = 1
                        self.nodeStates[c].data[missingCases, nodeIndx] = tmp

    ## Calculate the marginal probability of observing the measured data by
    #  integrating out all possible setting of latent variable states and
    #  model parameters.
    def calcEvidenceLikelihood(self):
        phosNodes = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'
        ]
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data)
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack(
                    (np.ones(nCases),
                     self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (
                    1 + np.exp(-np.dot(predStates, nodeObj.params[c, :])))
                pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(
                    np.float).eps

                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
                + (1 - nodeData) * np.log(1 - pOneCondOnParents))

            loglikelihood /= self.nChains
            return loglikelihood

    ## Perform graph search
    def trainGibbsEM(self,
                     nChains=10,
                     alpha=0.1,
                     nParents=4,
                     nSamples=5,
                     pickleDumpFile=None,
                     maxIter=1000):
        self.nChains = nChains
        self.alpha = alpha
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents

        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(
                self.alpha) + ".pickle"

        # check if the network and data agrees
        nodeToDelete = list()
        for nodeId in self.network:
            if self.network.node[nodeId][
                    'nodeObj'].type == 'FLUORESCENCE' and nodeId not in self.obsData.getColnames(
                    ):
                print "Node " + nodeId + " don't has associated data"
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print "removing node " + nodeId
                self.network.remove_node(nodeId)

        # Starting EM set up Markov chains  to train a model purely based on prior knowledge
        self._initParams()
        self._initHiddenStates()

        # perform update of latent variables in a layer-wise manner
        self.likelihood = list()

        self.expectedStates = list()
        nCases, nAntibodies = np.shape(self.obsData.data)
        for c in range(self.nChains):
            # each chain collect expected statistics of nodes from samples along the chain
            self.expectedStates.append(
                np.zeros(np.shape(self.nodeStates[c].data)))

        print "Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(
            self.nChains) + "; nSamples = " + str(
                self.nSamples) + "; nParents = " + str(self.nParents)
        optLikelihood = float("-inf")
        bConverged = False
        sampleCount = 0

        likelihood = self.calcEvidenceLikelihood()
        print "nIter: 0" + "; log likelihood of evidence: " + str(likelihood)
        self.likelihood.append(likelihood)
        for nIter in range(maxIter):

            # E-step of EM
            self._updateActivationStates()
            if (nIter + 1) % 2 == 0:  # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] += self.nodeStates[c].data

            # M-step of EM.  We only update parameters after a collecting a certain number of samples
            if sampleCount >= self.nSamples:
                sampleCount = 0
                # take expectation of sample states
                self.expectedStates = map(lambda x: x / self.nSamples,
                                          self.expectedStates)
                self._updteParams(self.alpha, nparents=self.nParents)

                likelihood = self.calcEvidenceLikelihood()
                self.likelihood.append(likelihood)
                print "nIter: " + str(
                    nIter +
                    1) + "; log likelihood of evidence: " + str(likelihood)

                # collect the current best fit models
                if likelihood > optLikelihood:
                    optLikelihood = likelihood
                    try:
                        cPickle.dump(self, open(self.pickleDumpFile, 'wb'))
                    except:
                        raise Exception("Cannot create pickle dumpfile " +
                                        self.pickleDumpFile)

                bConverged = self._checkConvergence()
                if bConverged:
                    print "EM converged!"
                    break

                for c in range(self.nChains):  # clear expectedStates
                    self.expectedStates[c] = np.zeros(
                        np.shape(self.nodeStates[c].data))

        # now try to delete edges that does contribute to evidence
        #self.trimEdgeByConsensus(.9)
        return self

    def _checkConvergence(self):
        # To do, add convergence checking code
        if len(self.likelihood) < 20:
            return False

        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml) / abs(ml)
        return ratio <= 0.001

    def _updateActivationStates(self):
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        # interate through all nodes.
        activationNode = [
            n for n in self.network
            if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE'
        ]

        for nodeId in activationNode:
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)

                # sample states of current node based on the prob, and update
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState

                # clamp the activationState of perturbed nodes to a fix value
                if nodeId in self.dictPerturbEffect:
                    # the diction keeps a list conditins under which the node is perurbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(
                            condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState == 1,
                                                indx] = state

    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate the marginal probability of a node's state set to "1" conditioning 
        on all evidence.
        
        args:
             nodeId   A string id of the node of interest
             c        An integer indicate the chain from which the parameter 
                         vector to be used  
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception(
                "Call _caclNodeMarginalProb on an observed variable " + nodeId)

        nCases, nAntibody = np.shape(self.obsData.data)

        # collect the state of the predecessors of the node
        preds = self.network.predecessors(nodeId)
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:  # if the node has parents
            # calculate p(curNode = 1 | parents);
            nodeParams = nodeObj.params[c, :]
            predStates = np.column_stack(
                (np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
            pOneCondOnParents = 1 / (1 +
                                     np.exp(-np.dot(predStates, nodeParams)))
            pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(np.float).eps
            pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(np.float).eps
            logProbOneCondOnParents = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect  evidence from  children
        logProbChildCondOne = 0  # the prob of child conditioning on current node == 1
        logProdOfChildCondZeros = 0

        children = self.network.successors(nodeId)
        if len(children) > 0:
            for child in children:
                childNodeObj = self.network.node[child]['nodeObj']
                curChildStates = self.nodeStates[c].getValuesByCol(child)

                # Collect states of the predecessors of the child
                childPreds = self.network.predecessors(child)
                childNodeParams = childNodeObj.params[c, :]
                childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
                childPredStates = np.column_stack(
                    (np.ones(nCases), childPredStates
                     ))  # padding data with a column ones as bias

                # Set the state of current node to ones
                curNodePosInPredList = childPreds.index(
                    nodeId) + 1  # offset by 1 because padding
                if childNodeParams[
                        curNodePosInPredList] == 0:  # not an real edge
                    continue
                childPredStates[:, curNodePosInPredList] = np.ones(nCases)
                pChildCondCurNodeOnes = 1 / (
                    1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeOnes[pChildCondCurNodeOnes == 1] -= np.finfo(
                    np.float).eps
                pChildCondCurNodeOnes[pChildCondCurNodeOnes == 0] += np.finfo(
                    np.float).eps
                logProbChildCondOne += np.log(curChildStates *
                                              pChildCondCurNodeOnes +
                                              (1 - curChildStates) *
                                              (1 - pChildCondCurNodeOnes))

                # set the state of the current node (nodeId) to zeros
                childPredStates[:, curNodePosInPredList] = np.zeros(nCases)
                pChildCondCurNodeZeros = 1 / (
                    1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeZeros[pChildCondCurNodeZeros ==
                                       1] -= np.finfo(np.float).eps
                pChildCondCurNodeZeros[pChildCondCurNodeZeros ==
                                       0] += np.finfo(np.float).eps
                logProdOfChildCondZeros += np.log(curChildStates *
                                                  pChildCondCurNodeZeros +
                                                  (1 - curChildStates) *
                                                  (1 - pChildCondCurNodeZeros))

        # now we can calculate the marginal probability of current node
        curNodeMarginal = 1 / (
            1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros -
                       logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal

    def parseGlmnetCoef(self, glmnet_res):
        """ Parse the 'beta' matrix returned by calling glmnet through RPy2.
            Return the first column of 'beta' matrix of the glmnet object 
            with 3 or more non-zero values 
            """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]

        # Read in lines of beta matrix txt, which is a nVariables * nLambda.
        # Since we call glmnet by padding x with a column of 1s, we only work
        # with the 'beta' matrix returned by fit
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        dimStr = re.search("\d+\s+x\s+\d+", betaLines[1]).group(0)
        if not dimStr:
            raise Exception(
                "'parse_glmnet_res' could not determine the dims of beta")
        nVariables, nLambda = map(int, dimStr.split(' x '))
        betaMatrix = np.zeros((nVariables, nLambda), dtype=np.float)

        # glmnet print beta matrix in mulitple blocks with
        # nVariable * blockSize
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = -blockSize
        for line in betaLines:  #read in blocks
            m = re.search('^V\d+', line)
            if not m:  # only find the lines begins with 'V\d'
                continue
            else:
                rowIndx = int(m.group(0)[1:len(m.group(0))])
            if rowIndx == 1:
                curBlockColStart += blockSize

            # set 'rowIndx' as start from 0
            rowIndx -= 1

            fields = line.rstrip().split()
            fields.pop(0)
            if len(fields) != blockSize:
                blockSize = len(fields)
            for j in range(blockSize):
                if fields[j] == '.':
                    continue
                else:
                    betaMatrix[rowIndx,
                               curBlockColStart + j] = float(fields[j])

        return a0, betaMatrix

    def _updteParams(self, alpha=0.1, nparents=None):
        # Update the parameter associated with each node, p(n | Pa(n)) using logistic regression,
        # using expected states of precessors as X and current node states acrss samples as y
        nCases, nVariables = np.shape(self.obsData.data)
        if not nparents:
            nparents = self.nParents

        for nodeId in self.network:
            nodeObj = self.network.node[nodeId]['nodeObj']
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            predIndices = self.nodeStates[0].findColIndices(preds)

            for c in range(self.nChains):
                expectedPredState = self.expectedStates[c][:, predIndices]
                #x = np.column_stack((np.ones(nCases), expectedPredState))
                x = np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId)

                #check if all x and y are of same value, which will lead to problem for glmnet
                rIndx = map(lambda z: int(math.floor(z)),
                            np.random.rand(50) * nCases)
                if sum(y) == nCases:  # if every y == 1
                    y[rIndx] = 0
                elif sum(map(lambda x: 1 - x, y)) == nCases:
                    y[rIndx] = 1
                y = robjects.vectors.IntVector(y)

                allRwoSumOnes = np.where(np.sum(x, 0) == nCases)[0]
                for col in allRwoSumOnes:
                    rIndx = map(lambda z: int(math.floor(z)),
                                np.random.rand(3) * nCases)
                    x[rIndx, col] = 0
                allZeros = np.where(
                    np.sum(np.ones(np.shape(x)) - x, 0) == nCases)
                for col in allZeros[0]:
                    rIndx = map(lambda z: int(math.floor(z)),
                                np.random.rand(3) * nCases)
                    x[rIndx, col] = 1

                # call logistic regression using glmnet from Rpy
                fit = glmnet(x, y, alpha=alpha, family="binomial", intercept=0)
                nodeObj.fitRes.append(fit)

                # extract coefficients glmnet, keep the first set beta with nParent non-zeros values
                a0, betaMatrix = self.parseGlmnetCoef(fit)
                for j in range(np.shape(betaMatrix)[1]):
                    if sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1

                myparams = betaMatrix[:, j]
                if sum(myparams != 0.) > nparents:
                    sortedParams = sorted(np.abs(myparams))
                    myparams[
                        np.abs(myparams) < sortedParams[-self.nParents]] = 0.

                nodeObj.params[c, :] = myparams

    def getStimuliSpecificNet(self, stimulus):
        self.stimuli = [
            'EGF', 'FGF1', 'HGF', 'IGF1', 'Insulin', 'NRG1', 'PBS', 'Serum'
        ]
        #self.stimuli = ['loLIG1',	'hiLIG1',	'loLIG2',	'hiLIG2']
        # trim unused edges
        if not stimulus in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus +
                            "' is not in the experiment data")

        #self.trimEdgeByConsensus(0.9)
        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli),
                              1) == 0

        # identify the nodes to keep by determine if a node responds to a stimuli
        activeNodes = set()
        activeNodes.add(stimulus)
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' \
            or self.network.node[nodeId]['nodeObj'].type == 'fluorescence':
                nodeControlValues = self.obsData.getValuesByCol(
                    nodeId)[controlCases]
                nodeStimulValues = self.obsData.getValuesByCol(
                    nodeId)[stimulusCases]
                ttestRes = R('t.test')(robjects.FloatVector(nodeControlValues),
                                       robjects.FloatVector(nodeStimulValues))
                pvalue = np.array(ttestRes.rx('p.value')[0])[0]
                if pvalue < 0.05:
                    activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy network to a tmp, redirect edges from activation state nodes
        # Edge indicates the impact
        tmpNet = nx.DiGraph()
        for u, v in self.network.edges():
            # we are only interested in the edge from protein point to antibody
            if (self.network.node[u]['nodeObj'].type == 'ACTIVATIONSTATE'\
            or self.network.node[u]['nodeObj'].type == 'activeState')\
            and (self.network.node[v]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'\
            or self.network.node[v]['nodeObj'].type == 'phosState'):
                # extract parameters associated with u and v
                vPreds = self.network.predecessors(v)
                uIndx = vPreds.index(u)
                vParams = np.sum(self.network.node[v]['nodeObj'].params, 0)
                if len(vParams) != (len(vPreds) + 1):
                    raise Exception("Bug in retrieving parameters of node v " +
                                    u)
                paramZeros = np.sum(
                    self.network.node[v]['nodeObj'].params == 0, 0)
                if np.float(paramZeros[uIndx + 1]) / float(self.nChains) > .9:
                    continue  # don't add edge with beta == 0

                for ab in self.dictProteinToAntibody[u]:
                    if ab not in self.network:
                        continue
                    # find the impact of phosphorylation on activation state
                    uPreds = self.network.predecessors(u)
                    uParams = np.mean(self.network.node[u]['nodeObj'].params,
                                      0)
                    if len(uParams) != (len(uPreds) + 1):
                        raise Exception(
                            "Bug in retrieving parameters of node v " + u)
                    #uAntibodyParam = uParams[uPreds.index(ab) + 1]

#                    if vParams[uIndx+1] > 0. and (vParams[uIndx+1] * uAntibodyParam) > 0:
#                        tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
#                    elif (vParams[uIndx+1] * uAntibodyParam) < 0.:
#                        tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])
                    if vParams[uIndx + 1] > 0.:
                        tmpNet.add_edge(ab,
                                        v,
                                        effect="+",
                                        betaValue=vParams[uIndx + 1])
                    elif vParams[uIndx + 1] < 0.:
                        tmpNet.add_edge(ab,
                                        v,
                                        effect="-",
                                        betaValue=vParams[uIndx + 1])

        # remove leave nodes that is not in activeNodes list
        while True:
            leafNodes = []
            for nodeId in tmpNet:
                if (nodeId not in activeNodes and len(tmpNet.successors(nodeId)) == 0)\
                or (nodeId not in activeNodes and len(tmpNet.predecessors(nodeId)) == 0):
                    leafNodes.append(nodeId)

            if len(leafNodes) == 0:
                break

            for leaf in leafNodes:
                tmpNet.remove_node(leaf)

        # now try to remove cycles and make the tmpNet a DAG
        return tmpNet

    def toGraphML(self, filename):
        tmpNet = nx.DiGraph()
        for edge in self.network.edges():
            tmpNet.add_edge(edge)

        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)

예제 #2

파일 보기

파일: PyGibbCAMP.py 프로젝트: akshayms/PyGibbCAMP

class PyGibbCAMP:  
    ## Constructor
    #  @param nodeFile  A string of pathname of file containing nodes.  The 
    #                   name, type, measured
    #  @param edgeFile  A list of tuples, each containing a source and sink node 
    #                   of an edge
    #  @param dataMatrixFile  A string to data
    def __init__(self, nodeFile , dataMatrixFile , perturbMatrix = None, missingDataMatrix=None):
        self.network = None
        self.obsData = None
        self.missingDataMatrix = None
        perturbInstances = None
        self.nChains = 1
        
        self.dictPerturbEffect = {'AKT1' : [('GSK690693',	0), \
        ('GSK690693_GSK1120212', 0)], 'MAP2K1' : [('GSK690693_GSK1120212', 0)],\
        'EGFR': [('EGF' , 1), ('FGF1', 1)]}
#        self.stimuli = ['EGF',	'FGF1',	'HGF',	'IGF1', 'Insulin',	'NRG1',	'PBS',	'Serum']

        # parse data mastrix by calling NamedMatrix class
        if not dataMatrixFile:
            raise Exception("Cannot create PyCAMP obj without 'dataMatrixFile'")
            return
        self.obsData = NamedMatrix(dataMatrixFile)
        nCases, nAntibodies = np.shape(self.obsData.data)
        self.obsData.colnames = map(lambda s: s+'F', self.obsData.colnames)
        self.obsDataFileName = dataMatrixFile
        
        if perturbMatrix:        
            self.perturbData = NamedMatrix(perturbMatrix)
            perturbInstances = self.perturbData.getColnames()
            self.perturbInstances = perturbInstances
                    
        if missingDataMatrix:
            self.missingDataMatrix = NamedMatrix(missingDataMatrix)
            allMissing = np.sum(self.missingDataMatrix, 0) ==  nCases
            if np.any(allMissing):
                raise Exception ("Data matrix contain data-less columns")
            self.missingDataMatrix.colnames = map(lambda s: s+'F', self.missingDataMatrix.colnames)

        if not nodeFile:
            raise Exception("Calling 'intiNetwork' with empty nodeFile name")
            return

        try:
            nf = open(nodeFile, "r")
            nodeLines = nf.readlines()
            if len(nodeLines) == 1:  # Mac files end a line with \r instead of \n
                nodeLines = nodeLines[0].split("\r")
            nf.close()
        except IOError:
            raise Exception( "Failed to open the file containing nodes")
            return
            
        print "Creating network"          
        self.network = nx.DiGraph()

        self.dictProteinToAntibody = dict()
        self.dictAntibodyToProtein = dict()
        # parse nodes
        for line in nodeLines:
            #print line
            protein, antibody = line.rstrip().split(',')
            
            if protein not in self.dictProteinToAntibody:
                self.dictProteinToAntibody[protein] = []
            self.dictProteinToAntibody[protein].append(antibody)
            self.dictAntibodyToProtein[antibody] = protein
            
            fluo = antibody + 'F'
            if protein not in self.network:
                self.network.add_node(protein, nodeObj = SigNetNode(protein, 'ACTIVATIONSTATE', False))
            self.network.add_node(antibody, nodeObj= SigNetNode(antibody, 'PHOSPHORYLATIONSTATE', False))
            self.network.add_node(fluo, nodeObj = SigNetNode(fluo, 'FLUORESCENCE', True))
            self.network.add_edge(antibody, protein)
            self.network.add_edge(antibody, fluo)
        
        for perturb in perturbInstances:
            self.network.add_node(perturb, nodeObj = SigNetNode(perturb, 'PERTURBATION', True))                
            
        # Add edges between PERTURBATION, protein activity,and  phosphorylation layers 
        for pro in self.dictProteinToAntibody:
            for phos in self.dictAntibodyToProtein:
                if self.dictAntibodyToProtein[phos] == pro:
                    continue
                self.network.add_edge(pro, phos)
            for perturb in perturbInstances:
                self.network.add_edge(perturb, pro)
            
        
    ## Init parameters of the model
    #  In Bayesian network setting, the joint probability is calculated
    #  through the product of a series conditional probability.  The parameters
    #  of the PyCAMP model defines p(x | Pa(X)).  For observed fluorescent node
    #  the conditional probability is a mixture of two Gaussian distribution.  
    #  therefore, the parameters are two pairs of mu and sigma.  For
    #  the hidden variables representing phosphorylation states and activation
    #  states of proteins, the conditional probability is defined by a logistic
    #  regression. Therefore, the parameters associated with such a node is a 
    #  vector of real numbers.
    # 
    def _initParams(self):
        print "Initialize parameters associated with each node in each MCMC chain"
        for nodeId in self.network: 
            self._initNodeParams(nodeId)
            
    def _initNodeParams(self, nodeId):
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.type == 'FLUORESCENCE':                
            # Estimate mean and sd of fluo signal using mixture model
            if self.missingDataMatrix and nodeId in self.missingDataMatrix.getColnames():
                nodeData = self.obsData.getValuesByCol( nodeId)
                nodeData = nodeData[self.missingDataMatrix.getValuesByCol(nodeId) == 0]
            else:
                nodeData = self.obsData.getValuesByCol(nodeId)
            nodeObj.mus = np.zeros((self.nChains, 2))
            nodeObj.sigmas = np.zeros((self.nChains, 2))
            for c in range(self.nChains):   
                mixGaussians = normalmixEM(robjects.FloatVector(nodeData), k = 2 )
                # mus and sigmas are represented as nChain x 2 matrices
                nodeObj.mus[c,:] = np.array(mixGaussians[2])
                nodeObj.sigmas[c,:] = np.array(mixGaussians[3])            
        else:
            preds = self.network.predecessors(nodeId)
            if len(preds) > 0:
                nodeObj.paramNames = preds
                nodeObj.params = np.random.randn(self.nChains, len(preds) + 1)
            else:
                nodeObj.params  = None
                
    
    ## Initialize latent variables
    #    
    #
    def _initHiddenStates(self):
        hiddenNodes = [n for n in self.network if not self.network.node[n]['nodeObj'].bMeasured]
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        #print str(phosNodes)
        nCases, nAntibody = self.obsData.shape()
        caseNames = self.obsData.getRownames()
        
        self.nodeStates = list()
        for c in range(self.nChains):
            tmp = np.zeros((nCases, len(hiddenNodes)))
            tmp[np.random.rand(nCases, len(hiddenNodes)) < 0.3] = 1
            tmp = np.column_stack((tmp, self.perturbData.data))
            colnames = hiddenNodes + self.perturbData.colnames
            self.nodeStates.append(NamedMatrix(npMatrix = tmp, colnames = colnames, rownames = caseNames))
            
            #initialize phos state based on the observed fluo 
            for node in phosNodes:
                fluoNode = node + 'F'
                #print "phosNode:" + node + "; fluoNode: " + fluoNode
                fluoNodeObj = self.network.node[fluoNode]['nodeObj']
                fluoData = self.obsData.getValuesByCol(fluoNode)
                tmp = np.zeros(nCases)
                phosProbOne = - np.log(fluoNodeObj.sigmas[c, 1])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 1]) / np.square(fluoNodeObj.sigmas[c, 1])                    
                phosProbZero = - np.log(fluoNodeObj.sigmas[c, 0])\
                - 0.5 * np.square(fluoData - fluoNodeObj.mus[c, 0]) / np.square(fluoNodeObj.sigmas[c, 0])
                tmp[phosProbOne > phosProbZero] = 1
                nodeIndx = self.nodeStates[c].findColIndices(node)
                self.nodeStates[c].data[:,nodeIndx] = tmp
                
                # take care of missing values by random sampling
                if self.missingDataMatrix:
                    if node in self.missingDataMatrix.getColnames(): 
                        #print "processing node with missing values: " + nodeId
                        missingCases = self.missingDataMatrix.getValuesByCol(node) == 1
                        tmp = np.zeros(sum(missingCases))
                        tmp[np.random.rand(len(tmp)) <= 0.3] = 1
                        self.nodeStates[c].data[missingCases, nodeIndx] = tmp
                    
        
        
    ## Calculate the marginal probability of observing the measured data by
    #  integrating out all possible setting of latent variable states and 
    #  model parameters.            
    def calcEvidenceLikelihood(self):
        phosNodes = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'PHOSPHORYLATIONSTATE']
        loglikelihood = 0
        nCases, nAntibodies = np.shape(self.obsData.data) 
        for nodeId in phosNodes:
            nodeObj = self.network.node[nodeId]['nodeObj']
            nodeIndx = self.nodeStates[0].findColIndices(nodeId)
            preds = self.network.predecessors(nodeId)
            for c in range(self.nChains):
                nodeData = self.nodeStates[c].data[:, nodeIndx]
                predStates = np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds)))
                pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeObj.params[c,:])))
                pOneCondOnParents[pOneCondOnParents == 1.] -= np.finfo(np.float).eps
                
                loglikelihood += np.sum(nodeData * np.log(pOneCondOnParents) \
                + (1 - nodeData) * np.log(1 - pOneCondOnParents))
                
            loglikelihood /= self.nChains
            return loglikelihood
        
    ## Perform graph search
    def trainGibbsEM(self, nChains = 10, alpha = 0.1, nParents = 4, nSamples = 5, pickleDumpFile = None, maxIter = 1000):
        self.nChains = nChains
        self.alpha = alpha  
        self.likelihood = list()
        self.nSamples = nSamples
        self.nParents = nParents
        
        if pickleDumpFile:
            self.pickleDumpFile = pickleDumpFile
        else:
            self.pickleDumpFile = self.obsDataFileName + "alpha" + str(self.alpha) +  ".pickle"  
        
        # check if the network and data agrees
        nodeToDelete = list()
        for nodeId in self.network:
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' and nodeId not in self.obsData.getColnames():
                print "Node " + nodeId + " don't has associated data"
                nodeToDelete.append(nodeId)
                nodeToDelete.append(self.network.predecessors(nodeId)[0])
        for nodeId in nodeToDelete:
            if self.network.has_node(nodeId):
                print "removing node " + nodeId
                self.network.remove_node(nodeId)

        # Starting EM set up Markov chains  to train a model purely based on prior knowledge        
        self._initParams()
        self._initHiddenStates()

        # perform update of latent variables in a layer-wise manner
        self.likelihood = list()        
        
        self.expectedStates = list()
        nCases, nAntibodies = np.shape(self.obsData.data)
        for c in range(self.nChains):                  
            # each chain collect expected statistics of nodes from samples along the chain
            self.expectedStates.append(np.zeros(np.shape(self.nodeStates[c].data)))

        print "Starting EM: alpha = " + str(self.alpha) + "; nChains = " + str(self.nChains) + "; nSamples = " + str (self.nSamples) + "; nParents = " + str(self.nParents)
        optLikelihood = float("-inf")
        bConverged = False
        sampleCount = 0
        
        likelihood = self.calcEvidenceLikelihood()
        print "nIter: 0"  + "; log likelihood of evidence: " + str(likelihood)
        self.likelihood.append(likelihood)
        for nIter in range(maxIter): 
                
            # E-step of EM
            self._updateActivationStates()            
            if  (nIter+1) % 2 == 0: # we collect sample every other iteration
                sampleCount += 1
                for c in range(self.nChains):
                    self.expectedStates[c] +=  self.nodeStates[c].data                
                
            # M-step of EM.  We only update parameters after a collecting a certain number of samples
            if sampleCount >= self.nSamples:                    
                sampleCount = 0
                 # take expectation of sample states
                self.expectedStates = map(lambda x: x / self.nSamples, self.expectedStates)
                self._updteParams(self.alpha, nparents = self.nParents)
                
                likelihood = self.calcEvidenceLikelihood()
                self.likelihood.append(likelihood)   
                print "nIter: " + str(nIter + 1) + "; log likelihood of evidence: " + str(likelihood)                    

                # collect the current best fit models
                if likelihood > optLikelihood:
                    optLikelihood = likelihood
                    try:
                        cPickle.dump(self, open(self.pickleDumpFile, 'wb'))
                    except: 
                        raise Exception("Cannot create pickle dumpfile " + self.pickleDumpFile)

                bConverged = self._checkConvergence()
                if bConverged:
                    print "EM converged!"
                    break
                
                for c in range(self.nChains):  # clear expectedStates
                    self.expectedStates[c] = np.zeros(np.shape(self.nodeStates[c].data))
                
        # now try to delete edges that does contribute to evidence
        #self.trimEdgeByConsensus(.9)
        return self  
            
    def _checkConvergence(self):
        # To do, add convergence checking code
        if len(self.likelihood) < 20:
            return False
            
        ml = np.mean(self.likelihood[-5:-1])
        ratio = abs(self.likelihood[-1] - ml ) / abs(ml)        
        return ratio <= 0.001

    def _updateActivationStates(self):
        nCases, antibody = np.shape(self.obsData.data)
        nCases, nHiddenNodes = np.shape(self.nodeStates[0].data)

        # interate through all nodes. 
        activationNode = [n for n in self.network if self.network.node[n]['nodeObj'].type == 'ACTIVATIONSTATE']
                    
        for nodeId in activationNode: 
            for c in range(self.nChains):
                curNodeMarginal = self.calcNodeCondProb(nodeId, c)
                
                # sample states of current node based on the prob, and update 
                sampleState = np.zeros(nCases)
                sampleState[curNodeMarginal >= np.random.rand(nCases)] = 1.
                curNodeIndx = self.nodeStates[c].findColIndices(nodeId)
                self.nodeStates[c].data[:, curNodeIndx] = sampleState
                
                # clamp the activationState of perturbed nodes to a fix value
                if nodeId in self.dictPerturbEffect:
                    # the diction keeps a list conditins under which the node is perurbed and the state to be clamped to
                    for condition, state in self.dictPerturbEffect[nodeId]:
                        perturbState = self.nodeStates[c].getValuesByCol(condition)
                        indx = self.nodeStates[c].findColIndices(nodeId)
                        self.nodeStates[c].data[perturbState==1, indx] = state
                        
            
    def calcNodeCondProb(self, nodeId, c):
        """
        Calculate the marginal probability of a node's state set to "1" conditioning 
        on all evidence.
        
        args:
             nodeId   A string id of the node of interest
             c        An integer indicate the chain from which the parameter 
                         vector to be used  
        """
        nodeObj = self.network.node[nodeId]['nodeObj']
        if nodeObj.bMeasured:
            raise Exception("Call _caclNodeMarginalProb on an observed variable " + nodeId)

        nCases, nAntibody = np.shape(self.obsData.data)        

        # collect the state of the predecessors of the node
        preds = self.network.predecessors(nodeId)        
        logProbOneCondOnParents = 0
        logProbZeroCondOnParents = 0
        if len(preds) > 0:  # if the node has parents  
            # calculate p(curNode = 1 | parents);                 
            nodeParams = nodeObj.params[c,:] 
            predStates =  np.column_stack((np.ones(nCases), self.nodeStates[c].getValuesByCol(preds))) 
            pOneCondOnParents = 1 / (1 + np.exp( - np.dot(predStates, nodeParams)))
            pOneCondOnParents[pOneCondOnParents == 1] -= np.finfo(np.float).eps
            pOneCondOnParents[pOneCondOnParents == 0] += np.finfo(np.float).eps
            logProbOneCondOnParents  = np.log(pOneCondOnParents)
            logProbZeroCondOnParents = np.log(1 - pOneCondOnParents)

        # collect  evidence from  children 
        logProbChildCondOne = 0  # the prob of child conditioning on current node == 1
        logProdOfChildCondZeros = 0
        
        children = self.network.successors(nodeId)
        if len(children) > 0:
            for child in children:  
                childNodeObj = self.network.node[child]['nodeObj']
                curChildStates = self.nodeStates[c].getValuesByCol(child)                    
                
                # Collect states of the predecessors of the child
                childPreds = self.network.predecessors(child)
                childNodeParams = childNodeObj.params[c,:]
                childPredStates = self.nodeStates[c].getValuesByCol(childPreds)
                childPredStates = np.column_stack((np.ones(nCases), childPredStates)) # padding data with a column ones as bias

                # Set the state of current node to ones 
                curNodePosInPredList = childPreds.index(nodeId) + 1 # offset by 1 because padding 
                if childNodeParams[curNodePosInPredList] == 0:  # not an real edge 
                    continue
                childPredStates[:, curNodePosInPredList] = np.ones(nCases)                
                pChildCondCurNodeOnes = 1 / (1 + np.exp(-np.dot(childPredStates, childNodeParams)))
                pChildCondCurNodeOnes[pChildCondCurNodeOnes==1] -= np.finfo(np.float).eps
                pChildCondCurNodeOnes[pChildCondCurNodeOnes==0] += np.finfo(np.float).eps
                logProbChildCondOne += np.log (curChildStates * pChildCondCurNodeOnes + (1 - curChildStates) * (1 - pChildCondCurNodeOnes))
                    
                # set the state of the current node (nodeId) to zeros 
                childPredStates [:, curNodePosInPredList] = np.zeros(nCases)
                pChildCondCurNodeZeros = 1 / (1 + np.exp(- np.dot(childPredStates, childNodeParams))) 
                pChildCondCurNodeZeros[pChildCondCurNodeZeros==1]  -= np.finfo(np.float).eps
                pChildCondCurNodeZeros[pChildCondCurNodeZeros==0]  += np.finfo(np.float).eps
                logProdOfChildCondZeros += np.log(curChildStates * pChildCondCurNodeZeros + (1 - curChildStates) * (1 - pChildCondCurNodeZeros))

        # now we can calculate the marginal probability of current node 
        curNodeMarginal = 1 / (1 + np.exp(logProbZeroCondOnParents + logProdOfChildCondZeros - logProbOneCondOnParents - logProbChildCondOne))
        return curNodeMarginal
    

    def parseGlmnetCoef(self, glmnet_res):        
        """ Parse the 'beta' matrix returned by calling glmnet through RPy2.
            Return the first column of 'beta' matrix of the glmnet object 
            with 3 or more non-zero values 
            """
        # read in intercept; a vector of length of nLambda
        a0 = np.array(glmnet_res.rx('a0'))[0]
        
        # Read in lines of beta matrix txt, which is a nVariables * nLambda.
        # Since we call glmnet by padding x with a column of 1s, we only work
        # with the 'beta' matrix returned by fit
        betaLines = StringIO(str(glmnet_res.rx('beta'))).readlines()
        dimStr = re.search("\d+\s+x\s+\d+", betaLines[1]).group(0)
        if not dimStr:
            raise Exception("'parse_glmnet_res' could not determine the dims of beta")
        nVariables , nLambda = map(int, dimStr.split(' x ')) 
        betaMatrix = np.zeros( (nVariables, nLambda), dtype=np.float)
        
        # glmnet print beta matrix in mulitple blocks with 
        # nVariable * blockSize
        blockSize = len(betaLines[4].split()) - 1
        curBlockColStart = - blockSize
        for line in betaLines:  #read in blocks
            m = re.search('^V\d+', line)
            if not m:  # only find the lines begins with 'V\d'
                continue
            else:
                rowIndx = int(m.group(0)[1:len(m.group(0))]) 
            if rowIndx == 1:
                curBlockColStart += blockSize
                
            # set 'rowIndx' as start from 0
            rowIndx -= 1

            fields = line.rstrip().split()
            fields.pop(0)
            if len(fields) != blockSize:
                blockSize = len(fields)
            for j in range(blockSize):
                if fields[j] == '.':
                    continue
                else:
                    betaMatrix[rowIndx, curBlockColStart + j] = float(fields[j])                 
                            
        return a0, betaMatrix       
      
        
    def _updteParams(self, alpha = 0.1, nparents=None):
        # Update the parameter associated with each node, p(n | Pa(n)) using logistic regression,
        # using expected states of precessors as X and current node states acrss samples as y
        nCases, nVariables = np.shape(self.obsData.data)
        if not nparents:
            nparents = self.nParents
        
        for nodeId in self.network:     
            nodeObj = self.network.node[nodeId]['nodeObj'] 
            if nodeObj.type == 'FLUORESCENCE' or nodeObj.type == 'PERTURBATION':
                continue
            nodeObj.fitRes = list()
            preds = self.network.predecessors(nodeId)
            predIndices = self.nodeStates[0].findColIndices(preds)
                       
            for c in range(self.nChains): 
                expectedPredState = self.expectedStates[c][:, predIndices]
                #x = np.column_stack((np.ones(nCases), expectedPredState))                    
                x =  np.column_stack((np.ones(nCases), expectedPredState))
                y = self.nodeStates[c].getValuesByCol(nodeId) 
                    
                #check if all x and y are of same value, which will lead to problem for glmnet
                rIndx = map(lambda z: int(math.floor(z)), np.random.rand(50) * nCases)
                if sum(y) == nCases:  # if every y == 1                      
                    y[rIndx] = 0                        
                elif sum( map(lambda x: 1 - x, y)) == nCases:
                    y[rIndx] = 1        
                y = robjects.vectors.IntVector(y)
                
                allRwoSumOnes = np.where(np.sum(x, 0) == nCases)[0]
                for col in allRwoSumOnes:
                    rIndx = map(lambda z: int(math.floor(z)), np.random.rand(3) * nCases)
                    x[rIndx, col] = 0 
                allZeros = np.where(np.sum(np.ones(np.shape(x)) - x, 0) == nCases) 
                for col in allZeros[0]:
                    rIndx = map(lambda z: int(math.floor(z)), np.random.rand(3) * nCases)
                    x[rIndx, col] = 1
                    
                # call logistic regression using glmnet from Rpy
                fit = glmnet (x, y, alpha = alpha, family = "binomial", intercept = 0)
                nodeObj.fitRes.append(fit)
                    
                # extract coefficients glmnet, keep the first set beta with nParent non-zeros values
                a0, betaMatrix = self.parseGlmnetCoef(fit) 
                for j in range(np.shape(betaMatrix)[1]):
                    if sum(betaMatrix[:, j] != 0.) >= nparents:
                        break
                if j >= len(a0):
                    j = len(a0) - 1
                    
                myparams = betaMatrix[:, j]
                if sum( myparams != 0.) > nparents:
                    sortedParams = sorted(np.abs(myparams))                    
                    myparams[np.abs(myparams) < sortedParams[-self.nParents]] = 0.  
                    
                nodeObj.params[c,:] =  myparams
                        
                        
    def getStimuliSpecificNet(self, stimulus):  
        self.stimuli = ['EGF',	'FGF1',	'HGF',	'IGF1',	 'Insulin',	'NRG1',	 'PBS',	 'Serum']
        #self.stimuli = ['loLIG1',	'hiLIG1',	'loLIG2',	'hiLIG2']
        # trim unused edges
        if not stimulus in self.nodeStates[0].getColnames():
            raise Exception("Input stimulus '" + stimulus + "' is not in the experiment data")

        #self.trimEdgeByConsensus(0.9)
        stimulusCases = self.perturbData.getValuesByCol(stimulus) == 1
        controlCases = np.sum(self.perturbData.getValuesByCol(self.stimuli), 1) == 0
        
        # identify the nodes to keep by determine if a node responds to a stimuli
        activeNodes = set()
        activeNodes.add(stimulus)
        for nodeId in self.network:            
            if self.network.node[nodeId]['nodeObj'].type == 'FLUORESCENCE' \
            or self.network.node[nodeId]['nodeObj'].type == 'fluorescence':
                nodeControlValues = self.obsData.getValuesByCol(nodeId)[controlCases]
                nodeStimulValues = self.obsData.getValuesByCol(nodeId)[stimulusCases]
                ttestRes = R('t.test')(robjects.FloatVector(nodeControlValues), robjects.FloatVector(nodeStimulValues))
                pvalue = np.array(ttestRes.rx('p.value')[0])[0]
                if pvalue < 0.05:
                    activeNodes.add(self.network.predecessors(nodeId)[0])

        # copy network to a tmp, redirect edges from activation state nodes 
        # Edge indicates the impact 
        tmpNet = nx.DiGraph()
        for u,  v in self.network.edges():
            # we are only interested in the edge from protein point to antibody
            if (self.network.node[u]['nodeObj'].type == 'ACTIVATIONSTATE'\
            or self.network.node[u]['nodeObj'].type == 'activeState')\
            and (self.network.node[v]['nodeObj'].type == 'PHOSPHORYLATIONSTATE'\
            or self.network.node[v]['nodeObj'].type == 'phosState'):
                # extract parameters associated with u and v
                vPreds = self.network.predecessors(v)
                uIndx = vPreds.index(u)
                vParams = np.sum(self.network.node[v]['nodeObj'].params, 0) 
                if len(vParams) != (len(vPreds) + 1):
                    raise Exception ("Bug in retrieving parameters of node v " + u)
                paramZeros = np.sum(self.network.node[v]['nodeObj'].params == 0, 0)
                if np.float(paramZeros[uIndx+1]) / float(self.nChains) > .9:
                    continue  # don't add edge with beta == 0
                    
                for ab in self.dictProteinToAntibody[u]: 
                    if ab not in self.network:
                        continue
                    # find the impact of phosphorylation on activation state
                    uPreds = self.network.predecessors(u)
                    uParams = np.mean(self.network.node[u]['nodeObj'].params, 0) 
                    if len(uParams) != (len(uPreds) + 1):
                        raise Exception ("Bug in retrieving parameters of node v " + u)
                    #uAntibodyParam = uParams[uPreds.index(ab) + 1]
                    
#                    if vParams[uIndx+1] > 0. and (vParams[uIndx+1] * uAntibodyParam) > 0:
#                        tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
#                    elif (vParams[uIndx+1] * uAntibodyParam) < 0.:
#                        tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])          
                    if vParams[uIndx+1] > 0. :
                        tmpNet.add_edge(ab, v, effect = "+", betaValue = vParams[uIndx+1])
                    elif vParams[uIndx+1]  < 0.:
                        tmpNet.add_edge(ab, v, effect = "-", betaValue = vParams[uIndx+1])          
            
        # remove leave nodes that is not in activeNodes list
        while True:
            leafNodes = []
            for nodeId in tmpNet:                     
                if (nodeId not in activeNodes and len(tmpNet.successors(nodeId)) == 0)\
                or (nodeId not in activeNodes and len(tmpNet.predecessors(nodeId)) == 0):
                    leafNodes.append(nodeId)
                    
            if len(leafNodes) == 0:
                break
            
            for leaf in leafNodes:
                tmpNet.remove_node(leaf)
        
        # now try to remove cycles and make the tmpNet a DAG
        return tmpNet
            
                         
                        
    def toGraphML(self, filename):
        tmpNet = nx.DiGraph()
        for edge in self.network.edges():
            tmpNet.add_edge(edge)
            
        nx.write_graphml(tmpNet, filename, encoding='utf-8', prettyprint=True)