def calcTCI (mutcnaMatrixFN, degMatrixFN, tumorTypeFN = None, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2], v0 = 0.3, ppiDict = None, dictGeneLength = None, outputPath = ".", opFlag = None, PANCANFlag = None, rowBegin=0, rowEnd = None):
    """
    calcTCI (mutcnaMatrix, degMatrix, alphaIJList, alphaIJKList, dictGeneLength)

    Calculate the causal scores between each pair of SGA and DEG observed in
    each tumor, and write one posterior matrix per tumor to
    <outputPath>/<tumor name>.csv.

    Inputs:
        mutcnaMatrixFN  A file containing a N x G binary matrix containing the
                        mutation and CNA data of all tumors.  N is the number
                        of tumors and G is the total number of unique genes.
                        For a tumor, genes that have SGAs are indicated by
                        "1"s and "0" otherwise.  (Original note: the last 19
                        columns are indicators of the tumor.)

        degMatrixFN     A file containing a N x G' binary matrix representing
                        DEG status.  A "1" indicates a gene is differentially
                        expressed in a tumor.

        tumorTypeFN     A string of filename.  The file contains a N x T
                        matrix in which each row has exactly one element set
                        to 1 and the rest to zero, indicating which type of
                        cancer each tumor belongs to.  Required when
                        PANCANFlag is set.

        alphaNull       Hyperparameters handed to calcNullF when scoring A0,
                        the cause that is "1" for all tumors.  (Older notes
                        call a similar list alphaIJList.)

        alphaIJKList    A list of Dirichlet hyperparameters for calculating
                        the prior of conditional probability parameters.
                        alphaIJK[0]: mut == 0 && deg == 0;
                        alphaIJK[1]: mut == 0 && deg == 1;
                        alphaIJK[2]: mut == 1 && deg == 0;
                        alphaIJK[3]: mut == 1 && deg == 1

        v0              A float scalar indicating the prior probability that
                        a DEG is caused by a non-SGA factor.

        ppiDict         A dictionary keeping the PPI network in the form of an
                        adjacency list (a dictionary of dictionaries).  Only
                        used when opFlag == OR.

        dictGeneLength  A dictionary keeping the length of each of the G genes
                        in the mutcnaMatrix.  Mandatory.

        outputPath      Directory passed to NamedMatrix.writeToText.

        opFlag          None for plain SGAs, or the module-level AND / OR
                        constants to select combined SGA events.

        PANCANFlag      A boolean flag to indicate if we are doing PANCAN.

        rowBegin, rowEnd
                        Select the block of tumors [rowBegin, rowEnd) handled
                        by this call, so several blocks can be processed in
                        parallel.  A falsy rowEnd (None or 0) or a rowEnd
                        beyond the number of tumors means "through the last
                        tumor".

    The process exits with status 1 (after printing a message) when inputs
    are missing, unreadable or inconsistent.

    NOTE: the list defaults are shared between calls; this function never
    modifies them itself.
    """
    # print(...) with a single argument behaves identically on Python 2 and 3.

    # check if gene length dictionary is set
    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit(1)

    # read in data in the form of NamedMatrix
    try:
        mutcnaMatrix = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit(1)

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit(1)

    mutGeneNames = mutcnaMatrix.getColnames()
    mutTumorNames = mutcnaMatrix.getRownames()
    degGeneNames = degMatrix.getColnames()
    exprsTumorNames = degMatrix.getRownames()

    # check if tumor names from the two matrices above agree
    if exprsTumorNames != mutTumorNames:
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        print(degMatrix.getRownames())
        print(mutcnaMatrix.getRownames())
        sys.exit(1)

    tumorNames = exprsTumorNames
    nTumors, nMutGenes = mutcnaMatrix.shape()

    # now perform PANCAN analysis related tasks
    if PANCANFlag:
        if not tumorTypeFN:
            print("Cannot perform PANCAN analysis without tumor-type-indicator matrix")
            sys.exit(1)

        try:
            tumorTypeMatrix = NamedMatrix(tumorTypeFN)
        except Exception:
            print("Failed to import tumor type file %s" % tumorTypeFN)
            sys.exit(1)

        tumorTypeTumorNames = [x.replace("\"", "") for x in tumorTypeMatrix.getRownames()]
        if exprsTumorNames != tumorTypeTumorNames:
            print("The tumors for tumorTypeMatrix and degMatrix do not fully overlap!")
            sys.exit(1)

        tumorTypes = tumorTypeMatrix.getColnames()

        # Calculate the prior probability that a tumor-type variable may
        # influence a DEG to be proportional to the number of tumors of that type
        vt = np.sum(tumorTypeMatrix.data, 0)       # per-type tumor counts
        vtprior = np.divide(vt, float(nTumors))    # normalize to 1, as prior for each type

    # Now start looping through a chunk of individual tumors and calculate the
    # causal scores between each pair of SGA and DEG
    print("Done with loading data, start processing tumor " + str(rowBegin))

    # rowEnd is an exclusive bound.  It used to be clamped to nTumors - 1,
    # which made the last tumor unreachable; clamp to nTumors instead.
    if not rowEnd or rowEnd > nTumors:
        rowEnd = nTumors
    elif rowEnd < rowBegin:
        print("Invalid rowEnd < rowBegin arguments given.")
        sys.exit(1)

    if rowBegin > rowEnd:
        print("Invlid rowBegin > rowEnd argument given.")
        sys.exit(1)

    for t in range(rowBegin, rowEnd):
        print("processign tumor " + tumorNames[t])

        # print pacifier
        if t % 50 == 0:
            print("\nProcessed %s tumors" % str(t))

        # collect data related to DEGs to construct a submatrix containing
        # only the DEGs of this tumor
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        tumorDEGMatrix = degMatrix.data[:,degGeneIndx]

        # genes that are altered in tumor t
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]
        nTumorMutGenes = len(tumorMutGenes)

        # extract the sub-matrix of mutcnaMatrix that only contains the genes
        # mutated in tumor t; if a combination operation is requested a new
        # combined mutation matrix is created instead
        if opFlag == OR:
            tumorMutMatrix = createORComb(tumorMutGenes, ppiDict, mutcnaMatrix)
        else:
            # default: columns of mutcnaMatrix corresponding to the altered genes
            tumorMutMatrix = mutcnaMatrix.data[:, tumormutGeneIndx]

        # Include the tumor-type label into the tumorMutMatrix as a
        # tissue-specific fake GT to capture DEGs with tissue-specific
        # characteristics
        if PANCANFlag:
            tumorTypeLabelIndx = np.where(tumorTypeMatrix.data[t,:] == 1)[0]
            if len(tumorTypeLabelIndx) != 1:
                raise Exception("Fail to extract tumor type")

            # index with the 1-element array to keep the label as a column
            tumorTypeColumn = tumorTypeMatrix.data[:, tumorTypeLabelIndx]
            tumorMutMatrix = np.hstack((tumorMutMatrix, tumorTypeColumn))

            # look the name up with a plain integer; a 1-element array is not
            # a valid index for a Python list on current NumPy
            tumorTypeName = tumorTypes[int(tumorTypeLabelIndx[0])]
            tumorMutGenes.append(tumorTypeName)
            nTumorMutGenes = len(tumorMutGenes)

        # single pairwise likelihood that an SGA causes a DEG (rows are
        # mutGenes, columns are DEGs), currently without the joint impact
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix, alphaIJKList)

        # If PANCAN analysis, combine the tumor-type label with each GT to
        # get the likelihood of a DEG conditioned jointly on GT and label.
        # This captures a GT that regulates a GE but also tends to co-occur
        # with a specific tumor type.
        if PANCANFlag:
            if opFlag == AND:
                raise Exception ("Combination of AND operation with PanCan analysis is not implemented")

            jointGTandTumorLableFScore = np.zeros((tumorMutMatrix.shape[1], tumorDEGMatrix.shape[1]))

            # multiplication acts as the AND of the two indicators
            jointStates = (
                (tumorMutMatrix,      tumorTypeColumn),        # GT == 1 && label == 1
                (tumorMutMatrix,      tumorTypeColumn == 0),   # GT == 1 && label == 0
                (tumorMutMatrix == 0, tumorTypeColumn),        # GT == 0 && label == 1
                (tumorMutMatrix == 0, tumorTypeColumn == 0),   # GT == 0 && label == 0
            )
            for gtState, labelState in jointStates:
                tmpMutMatrix = np.multiply(gtState, labelState)
                # keep each term in its own variable: reusing tumorLnFScore
                # here would discard the single-SGA scores computed above
                jointTermScore = calcF(tmpMutMatrix, tumorDEGMatrix, alphaIJKList)
                jointGTandTumorLableFScore = add(jointGTandTumorLableFScore, jointTermScore)

            # stack the joint log-likelihood matrix on top of the single
            # scores, dropping the joint row of the tumor-type label itself
            tumorLnFScore = np.vstack((jointGTandTumorLableFScore[:-1,:], tumorLnFScore))

        # Likelihood that A0, which is 1 for all tumors, is the cause for the
        # DEGs; stacking it is equivalent to adding a column of '1' to
        # tumorMutMatrix
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))

        # log of the prior probability that any of the mutated genes plus A0
        # can be a cause for a DEG
        if PANCANFlag:
            if not opFlag:
                lntumorMutPriors = calcPanCanLnPrior(tumorMutGenes, dictGeneLength, vtprior[tumorTypeLabelIndx], v0)
            elif opFlag == AND:
                lntumorMutPriors = calcPanCanLnCombANDPrior(tumorMutGenes, dictGeneLength, vtprior[tumorTypeLabelIndx], v0)
            elif opFlag == OR:
                lntumorMutPriors = calcPanCanLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, vtprior[tumorTypeLabelIndx], v0)
        else:
            if not opFlag:
                # a m-dimension vector with m being number of mutations
                lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)
            elif opFlag == AND:
                lntumorMutPriors = calcLnCombANDPrior(tumorMutGenes, dictGeneLength, v0)
            elif opFlag == OR:
                lntumorMutPriors = calcLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, v0)

        # add to each column; note the double transpose because numpy
        # broadcasts by row
        tumorLnFScore = np.add(tumorLnFScore.T, lntumorMutPriors).T

        # calculate the normalizer for each column (GE)
        colLogSum = calcColNormalizer(tumorLnFScore)
        normalizer = np.tile(colLogSum, (tumorLnFScore.shape[0], 1))
        posteriorAll = np.exp(add(tumorLnFScore, - normalizer))

        if PANCANFlag:
            # sum the posterior of each single GT with the posterior of its
            # joint GT/tumor-type row, then keep the label and A0 rows as-is
            posterior = np.add(posteriorAll[0:nTumorMutGenes-1, :], posteriorAll[nTumorMutGenes - 1:-2, :])
            posterior = np.vstack((posterior, posteriorAll[-2:, :]))
        else:
            # no joint rows were stacked, so there is nothing to fold together
            posterior = posteriorAll

        # write out the results
        tumorMutGenes.append('A0')
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)
        tumorPosterior.writeToText(filePath = outputPath, filename = tumorNames[t] + ".csv")
def calcTCI (mutcnaMatrixFN, degMatrixFN, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2], v0=0.2, ppiDict = None, dictGeneLength = None, outputPath = ".", opFlag = None, rowBegin=0, rowEnd = None):
    """
    calcTCI (mutcnaMatrix, degMatrix, alphaIJList, alphaIJKList, dictGeneLength)

    Calculate the causal scores between each pair of SGA and DEG observed in
    each tumor, and write one posterior matrix per tumor to
    <outputPath>/<tumor name>.csv.  Tumors with fewer than 2 mutations are
    skipped.

    Inputs:
        mutcnaMatrixFN  A file containing a N x G binary matrix containing the
                        mutation and CNA data of all tumors.  N is the number
                        of tumors and G is the total number of unique genes.
                        For a tumor, genes that have SGAs are indicated by
                        "1"s and "0" otherwise.

        degMatrixFN     A file containing a N x G' binary matrix representing
                        DEG status.  A "1" indicates a gene is differentially
                        expressed in a tumor.

        alphaNull       Hyperparameters handed to calcNullF when scoring A0.
                        (Older notes call a similar list alphaIJList.)

        alphaIJKList    A list of Dirichlet hyperparameters for calculating
                        the prior of conditional probability parameters.
                        alphaIJK[0]: mut == 0 && deg == 0;
                        alphaIJK[1]: mut == 0 && deg == 1;
                        alphaIJK[2]: mut == 1 && deg == 0;
                        alphaIJK[3]: mut == 1 && deg == 1

        v0              A float scalar indicating the prior probability that
                        a DEG is caused by a non-SGA factor.

        ppiDict         A dictionary keeping the PPI network in the form of an
                        adjacency list (a dictionary of dictionaries).  Only
                        used when opFlag == OR.

        dictGeneLength  A dictionary keeping the length of each of the G genes
                        in the mutcnaMatrix.  Mandatory.

        outputPath      Directory passed to NamedMatrix.writeToText.

        opFlag          None for plain SGAs, or the module-level AND / OR
                        constants to select combined SGA events.

        rowBegin, rowEnd
                        Select the block of tumors [rowBegin, rowEnd) handled
                        by this call, so several blocks can be processed in
                        parallel.  A falsy rowEnd (None or 0) or a rowEnd
                        beyond the number of tumors means "through the last
                        tumor".

    The process exits with status 1 (after printing a message) when inputs
    are missing, unreadable or inconsistent.

    NOTE: the list defaults are shared between calls; this function never
    modifies them itself.
    """
    # print(...) with a single argument behaves identically on Python 2 and 3.

    # read in data in the form of NamedMatrix
    try:
        mutcnaMatrix = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit(1)

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit(1)

    # compare tumor names with any embedded double quotes removed
    exprsTumorNames = [x.replace("\"", "") for x in degMatrix.getRownames()]
    mutTumorNames = [x.replace("\"", "") for x in mutcnaMatrix.getRownames()]
    if exprsTumorNames != mutTumorNames:
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        print(degMatrix.getRownames())
        print(mutcnaMatrix.getRownames())
        sys.exit(1)

    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit(1)

    tumorNames = degMatrix.getRownames()
    nTumors, nMutGenes = mutcnaMatrix.shape()

    mutGeneNames = mutcnaMatrix.getColnames()
    degGeneNames = degMatrix.getColnames()

    # rowEnd is an exclusive bound.  It used to be clamped to nTumors - 1,
    # which made the last tumor unreachable; clamp to nTumors instead.
    if not rowEnd or rowEnd > nTumors:
        rowEnd = nTumors
    elif rowEnd < rowBegin:
        print("Invalid rowEnd < rowBegin arguments given.")
        sys.exit(1)

    if rowBegin > rowEnd:
        print("Invlid rowBegin > rowEnd argument given.")
        sys.exit(1)

    print("Done with loading data, start processing tumor " + str(rowBegin))

    negInf = float("-inf")   # same value as np.NINF, which NumPy 2.0 removed

    # loop through individual tumors and calculate the causal scores between
    # each pair of SGA and DEG
    for t in range(rowBegin, rowEnd):
        # print pacifier
        if t % 50 == 0:
            print("Processed %s tumors" % str(t))

        # collect data related to DEGs: the genes differentially expressed in
        # this tumor and their columns across all tumors
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        nTumorDEGs = len(degGeneIndx)   # n, the number of DEGs in this tumor
        tumorDEGMatrix = degMatrix.data[:,degGeneIndx]

        # collect data related to mutations
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]
        if len(tumormutGeneIndx) < 2:
            print(tumorNames[t] + " has less than 2 mutations, skip.")
            continue
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]

        # sub-matrix of mutcnaMatrix that only contains the genes mutated in
        # tumor t.  It is built up front because the AND branch needs it as
        # input; previously that branch read the variable before assignment.
        tumorMutMatrix = mutcnaMatrix.data[:, tumormutGeneIndx]

        # if a combination operation is requested, a new combined mutation
        # matrix replaces the plain one
        if opFlag == AND:
            tmpNamedMat = NamedMatrix(npMatrix = tumorMutMatrix, colnames = tumorMutGenes, rownames = tumorNames)
            tumorNamedMatrix = createANDComb(tmpNamedMat, opFlag)
            if not tumorNamedMatrix:
                # this tumor does not have any joint mutations that are
                # observed in 2% of all tumors
                continue
            tumorMutMatrix = tumorNamedMatrix.data
            tumorMutGenes = tumorNamedMatrix.colnames
        elif opFlag == OR:
            tumorMutMatrix = createORComb(tumorMutGenes, ppiDict, mutcnaMatrix)

        # log prior for each (possibly combined) mutation event
        if not opFlag:
            # a m-dimension vector with m being number of mutations
            lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)
        elif opFlag == AND:
            lntumorMutPriors = calcLnCombANDPrior(tumorMutGenes, dictGeneLength, v0)
        elif opFlag == OR:
            lntumorMutPriors = calcLnCombORPrior(tumorMutGenes, ppiDict, dictGeneLength, mutcnaMatrix.colnames, v0)

        tumorMutGenes.append('A0')

        # pairwise likelihood that an SGA causes a DEG
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix, alphaIJKList)

        # Likelihood of expression data conditioned on A0, stacked onto the
        # LnFScore; equivalent to adding a column of '1' to tumorMutMatrix
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))

        # tile the priors up so they can be added to every DEG column
        tumorMutPriorMatrix = np.tile(lntumorMutPriors, (nTumorDEGs, 1)).T
        lnFScore = add(tumorLnFScore, tumorMutPriorMatrix)

        # per-column log-sum used to normalize lnFScore; -inf entries are
        # skipped rather than fed to logSum
        columnAccumLogSum = np.zeros(nTumorDEGs)
        for col in range(nTumorDEGs):
            currLogSum = negInf
            for j in range(lnFScore.shape[0]):
                if lnFScore[j,col] == negInf:
                    continue
                currLogSum = logSum(currLogSum, lnFScore[j,col])
            columnAccumLogSum[col] = currLogSum

        normalizer = np.tile(columnAccumLogSum, (lnFScore.shape[0], 1))
        posterior = np.exp(add(lnFScore, - normalizer))

        # write out the results
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)
        if "\"" in tumorNames[t]:
            # strip quotes so they do not end up in the output file name
            tumorNames[t] = tumorNames[t].replace("\"", "")
        tumorPosterior.writeToText(filePath = outputPath, filename = tumorNames[t] + ".csv")
def calcTCI (mutcnaMatrixFN, degMatrixFN, alphaNull = [1, 1], alphaIJKList = [2, 1, 1, 2], v0=0.2, dictGeneLength = None, outputPath = ".", opFlag = None):
    """
    calcTCI (mutcnaMatrix, degMatrix, alphaIJList, alphaIJKList, dictGeneLength)

    Calculate the causal scores between each pair of SGA and DEG observed in
    each tumor, and write one posterior matrix per tumor to
    <outputPath>/<tumor name>-mut-vs-DEG-posterior.csv.

    Inputs:
        mutcnaMatrixFN  A file containing a N x G binary matrix containing the
                        mutation and CNA data of all tumors.  N is the number
                        of tumors and G is the total number of unique genes.
                        For a tumor, genes that have SGAs are indicated by
                        "1"s and "0" otherwise.

        degMatrixFN     A file containing a N x G' binary matrix representing
                        DEG status.  A "1" indicates a gene is differentially
                        expressed in a tumor.

        alphaNull       Hyperparameters handed to calcNullF when scoring A0.
                        (Older notes call a similar list alphaIJList.)

        alphaIJKList    A list of Dirichlet hyperparameters for calculating
                        the prior of conditional probability parameters.
                        alphaIJK[0]: mut == 0 && deg == 0;
                        alphaIJK[1]: mut == 0 && deg == 1;
                        alphaIJK[2]: mut == 1 && deg == 0;
                        alphaIJK[3]: mut == 1 && deg == 1

        v0              A float scalar indicating the prior probability that
                        a DEG is caused by a non-SGA factor.

        dictGeneLength  A dictionary keeping the length of each of the G genes
                        in the mutcnaMatrix.  Mandatory.

        outputPath      Directory passed to NamedMatrix.writeToText.

        opFlag          When truthy, createComb builds a combined mutation
                        matrix for each tumor and calcLnCombPrior supplies the
                        priors; otherwise the plain SGA columns and
                        calcLnPrior are used.

    The process exits with status 1 (after printing a message) when inputs
    are missing, unreadable or inconsistent.

    NOTE: the list defaults are shared between calls; this function never
    modifies them itself.
    """
    # print(...) with a single argument behaves identically on Python 2 and 3.

    # read in data in the form of NamedMatrix
    try:
        mutcnaMatrix = NamedMatrix(mutcnaMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % mutcnaMatrixFN)
        sys.exit(1)

    try:
        degMatrix = NamedMatrix(degMatrixFN)
    except Exception:
        print("Failed to import data matrix %s\n" % degMatrixFN)
        sys.exit(1)

    if degMatrix.getRownames() != mutcnaMatrix.getRownames():
        print("The tumors for mutcnaMatrix and degMatrix do not fully overlap!")
        sys.exit(1)

    if not dictGeneLength:
        print("Gene length dictionary not provided, quit\n")
        sys.exit(1)

    # now we iterate through each tumor to infer the causal relationship
    # between each pair of mut - deg
    tumorNames = degMatrix.getRownames()
    nTumors, nMutGenes = mutcnaMatrix.shape()

    mutGeneNames = mutcnaMatrix.getColnames()
    degGeneNames = degMatrix.getColnames()

    negInf = float("-inf")   # same value as np.NINF, which NumPy 2.0 removed

    for t in range(nTumors):
        # print pacifier
        if t % 50 == 0:
            print("Processed %s tumors" % str(t))

        # collect data related to mutations
        tumormutGeneIndx = [i for i, j in enumerate(mutcnaMatrix.data[t,:]) if j == 1]
        nTumorMutGenes = len(tumormutGeneIndx)
        tumorMutGenes = [mutGeneNames[i] for i in tumormutGeneIndx]

        # sub-matrix of mutcnaMatrix that only contains the genes mutated in
        # tumor t; if a combination operation is requested, a new combined
        # mutation matrix replaces it
        tumorMutMatrix = mutcnaMatrix.data[:, tumormutGeneIndx]
        if opFlag:
            tmpNamedMat = NamedMatrix(npMatrix = tumorMutMatrix, colnames = tumorMutGenes, rownames = tumorNames)
            tumorNamedMatrix = createComb(tmpNamedMat, opFlag)
            if not tumorNamedMatrix:
                # this tumor does not have any joint mutations that are
                # observed in 2% of all tumors
                continue
            tumorMutGenes = tumorNamedMatrix.colnames
            tumorMutMatrix = tumorNamedMatrix.data

        # log prior for each (possibly combined) mutation event
        if not opFlag:
            # a m-dimension vector with m being number of mutations
            lntumorMutPriors = calcLnPrior(tumorMutGenes, dictGeneLength, v0)
        else:
            lntumorMutPriors = calcLnCombPrior(tumorMutGenes, dictGeneLength, v0)

        tumorMutGenes.append('A0')

        # collect data related to DEGs
        degGeneIndx = [i for i, j in enumerate(degMatrix.data[t,:]) if j == 1]
        tumorDEGGenes = [degGeneNames[i] for i in degGeneIndx]
        nTumorDEGs = len(degGeneIndx)   # n, the number of DEGs in this tumor
        tumorDEGMatrix = degMatrix.data[:,degGeneIndx]

        # pair-wise m x n score matrix, with the A0 (null) row stacked below
        tumorLnFScore = calcF(tumorMutMatrix, tumorDEGMatrix, alphaIJKList)
        nullFscore = calcNullF(tumorDEGMatrix, alphaNull)
        tumorLnFScore = np.vstack((tumorLnFScore, nullFscore))

        # tile the priors up so they can be added to every DEG column
        tumorMutPriorMatrix = np.tile(lntumorMutPriors, (nTumorDEGs, 1)).T
        lnFScore = add(tumorLnFScore, tumorMutPriorMatrix)

        # per-column log-sum used to normalize lnFScore; -inf entries are
        # skipped rather than fed to logSum
        columnAccumLogSum = np.zeros(nTumorDEGs)
        for col in range(nTumorDEGs):
            currLogSum = negInf
            for j in range(lnFScore.shape[0]):
                if lnFScore[j,col] == negInf:
                    continue
                currLogSum = logSum(currLogSum, lnFScore[j,col])
            columnAccumLogSum[col] = currLogSum

        normalizer = np.tile(columnAccumLogSum, (lnFScore.shape[0], 1))
        posterior = np.exp(add(lnFScore, - normalizer))

        # write out the results
        tumorPosterior = NamedMatrix(npMatrix = posterior, rownames = tumorMutGenes, colnames = tumorDEGGenes)
        tumorPosterior.writeToText(outputPath, filename = tumorNames[t] + "-mut-vs-DEG-posterior.csv")