def rankSomaticMutations_byTreatmentContingency(inputRNAFilename, outputFilename, treatmentsCSVFilename,
                                                additionalHeaders = ("Entrez_Gene_Id","Chrom","Start_Position","End_Position","Strand","Variant_Type","Reference_Allele","Tumor_Seq_Allele1","Tumor_Seq_Allele2"),
                                                genesHeaderColumnName = "Hugo_Symbol", delimiter = "\t"):
    '''
    Contingency table approach for measuring significant difference in somatic
    mutations between different treatment groups.

    For every row of the input file and every ordered pair of distinct treatments
    (both A_B and B_A are emitted), the per-sample values of the two treatments are
    passed to Analysis.calculateContingencyP_expectedWeighted and the resulting
    p-value is written to the output file.

    Parameters:
        inputRNAFilename      -- delimited input file; read through Utility.getColumns.
        outputFilename        -- file to (over)write with one row per input row.
        treatmentsCSVFilename -- file handed to Analysis.buildTreatmentsLists, which
                                 is used here as a mapping treatment -> sample IDs.
        additionalHeaders     -- extra input columns copied verbatim into each output
                                 row (e.g. headers that occur in mutation signature
                                 files). None or an empty sequence disables this.
        genesHeaderColumnName -- name of the input column holding the gene symbol.
        delimiter             -- field separator used for the treatments file and
                                 for the output file.

    Returns:
        outputFilename, unchanged. When buildTreatmentsLists yields an empty/None
        result, nothing is written and the output file is not created.

    Raises:
        AssertionError if any of the three filenames is empty/None. Errors raised
        while opening the files, a missing gene/additional column, or a sample value
        that cannot be converted to float propagate to the caller.
    '''
    from Utility.Utility import Utility
    import numpy as np

    USE_NUMPY_WRAPPING = True

    assert treatmentsCSVFilename
    assert inputRNAFilename
    assert outputFilename

    treatmentsDict = Analysis.buildTreatmentsLists(treatmentsCSVFilename, delimiter)
    if not treatmentsDict:
        return outputFilename

    extraHeaders = list(additionalHeaders) if additionalHeaders else []

    # Ordered pairs of distinct treatments, computed once so that the header and
    # every data row are guaranteed to use the same column order.
    treatmentPairs = [(treat1, treat2)
                      for treat1 in treatmentsDict
                      for treat2 in treatmentsDict
                      if treat1 != treat2]

    headerFields = [genesHeaderColumnName] + extraHeaders
    headerFields.extend("%s_%s_pValue" % pair for pair in treatmentPairs)

    # 'with' guarantees the output file is closed even if a row fails mid-way.
    with open(outputFilename, "w") as outputFile:
        # Joining the fields (instead of concatenating and stripping the delimiter)
        # keeps empty leading/trailing fields, so columns never shift.
        outputFile.write(delimiter.join(headerFields) + "\n")

        with open(inputRNAFilename, "r") as inputRNAFile:
            columns, indexToName = Utility.getColumns(inputRNAFile)  # @UnusedVariable
            geneList = columns[genesHeaderColumnName]
            totalMutations = len(geneList)

            for i, gene in enumerate(geneList):
                counter = i + 1
                if counter % 1000 == 0:
                    percentDone = (float(counter) / totalMutations) * 100
                    # Single parenthesised argument: valid on Python 2 and Python 3.
                    print("Progress completed: %s percent. Working on mutation at gene: %s" % (str(percentDone), gene))

                rowFields = [gene]
                rowFields.extend(columns[addHeader][i] for addHeader in extraHeaders)

                samplesDict = {}
                for treatment, samplesPerTreatment in treatmentsDict.items():
                    values = []
                    for sampleID in samplesPerTreatment:
                        try:
                            values.append(columns[sampleID][i])
                        except LookupError:
                            # Sample listed for the treatment but not present in
                            # the input file (or its column is too short): skip it.
                            continue
                    if USE_NUMPY_WRAPPING:
                        # The np.float alias was removed in NumPy 1.24; the builtin
                        # float selects the same float64 dtype.
                        values = np.array(values).astype(float)
                    samplesDict[treatment] = values

                for treat1, treat2 in treatmentPairs:
                    try:
                        p = Analysis.calculateContingencyP_expectedWeighted(samplesDict[treat1], samplesDict[treat2])
                    except Exception:
                        # Best effort: a failed test is reported as "not
                        # significant" rather than aborting the whole run.
                        p = float(1.0)
                    rowFields.append(str(p))

                outputFile.write(delimiter.join(rowFields) + "\n")
                # Flush per row so partial results are visible during long runs.
                outputFile.flush()

    return outputFilename
def rankRNAExpressionTreatments_byMannWhitney(inputRNAFilename, outputFilename, treatmentsCSVFilename,
                                              genesHeaderColumnName = "gene_id|gene_id_code", delimiter = "\t"):
    '''
    Method uses Mann-Whitney test to determine variations in the distributions
    of RNA expression data between treatments.

    For every row of the input file and every ordered pair of distinct treatments
    (both A_B and B_A are emitted), scipy.stats.mannwhitneyu is applied to the
    per-sample values of the two treatments; the U statistic and the p-value are
    written to the output file.

    Parameters:
        inputRNAFilename      -- delimited input file; read through Utility.getColumns.
        outputFilename        -- file to (over)write with one row per input row.
        treatmentsCSVFilename -- file handed to Analysis.buildTreatmentsLists, which
                                 is used here as a mapping treatment -> sample IDs.
        genesHeaderColumnName -- name of the input column holding the gene identifier.
                                 If the first value contains "|", only the part before
                                 the first "|" of every value is written out.
        delimiter             -- field separator used for the treatments file and
                                 for the output file.

    Returns:
        outputFilename, unchanged. When buildTreatmentsLists yields an empty/None
        result, nothing is written and the output file is not created.

    Raises:
        AssertionError if any of the three filenames is empty/None. Errors raised
        while opening the files, a missing gene column, or a sample value that
        cannot be converted to float propagate to the caller.
    '''
    from Utility.Utility import Utility
    import numpy as np
    import scipy.stats as scistats

    TEST_TYPE = "mannwhitneyu"
    USE_NUMPY_WRAPPING = True

    assert treatmentsCSVFilename
    assert inputRNAFilename
    assert outputFilename

    treatmentsDict = Analysis.buildTreatmentsLists(treatmentsCSVFilename, delimiter)
    if not treatmentsDict:
        return outputFilename

    # Ordered pairs of distinct treatments, computed once so that the header and
    # every data row are guaranteed to use the same column order.
    treatmentPairs = [(treat1, treat2)
                      for treat1 in treatmentsDict
                      for treat2 in treatmentsDict
                      if treat1 != treat2]

    headerFields = ["gene_id"]
    for treat1, treat2 in treatmentPairs:
        headerFields.append("%s_%s_U_%s" % (treat1, treat2, TEST_TYPE))
        headerFields.append("%s_%s_p_%s" % (treat1, treat2, TEST_TYPE))

    # 'with' guarantees the output file is closed even if a row fails mid-way.
    with open(outputFilename, "w") as outputFile:
        # Joining the fields (instead of concatenating and stripping the delimiter)
        # keeps empty leading/trailing fields, so columns never shift.
        outputFile.write(delimiter.join(headerFields) + "\n")

        with open(inputRNAFilename, "r") as inputRNAFile:
            columns, indexToName = Utility.getColumns(inputRNAFile)  # @UnusedVariable
            geneList = columns[genesHeaderColumnName]

            # Guard against an input with no data rows before peeking at the
            # first identifier.
            if geneList and "|" in geneList[0]:
                geneList = [x.split("|")[0] for x in geneList]

            for i, gene in enumerate(geneList):
                rowFields = [gene]

                samplesDict = {}
                for treatment, samplesPerTreatment in treatmentsDict.items():
                    values = []
                    for sampleID in samplesPerTreatment:
                        try:
                            values.append(columns[sampleID][i])
                        except LookupError:
                            # Sample listed for the treatment but not present in
                            # the input file (or its column is too short): skip it.
                            continue
                    if USE_NUMPY_WRAPPING:
                        # The np.float alias was removed in NumPy 1.24; the builtin
                        # float selects the same float64 dtype.
                        values = np.array(values).astype(float)
                    samplesDict[treatment] = values

                for treat1, treat2 in treatmentPairs:
                    try:
                        U, pValue = scistats.mannwhitneyu(samplesDict[treat1], samplesDict[treat2])
                    except Exception:
                        # Best effort: a failed test is reported with neutral
                        # values (U = 0, p = 1) rather than aborting the run.
                        U, pValue = float(0.0), float(1.0)
                    rowFields.append(str(U))
                    rowFields.append(str(pValue))

                outputFile.write(delimiter.join(rowFields) + "\n")
                # Flush per row so partial results are visible during long runs.
                outputFile.flush()

    return outputFilename
def WriteVRTransaction(set, writeHeader):
    '''
    Build a ValueResearchStatement from the statement returned by
    ReadTransaction(set, writeHeader) and pass it to Utility.WriteStatement
    together with set.outputVRFormat.

    Parameters:
        set         -- object forwarded to ReadTransaction; its outputVRFormat
                       attribute is forwarded to Utility.WriteStatement.
                       (The name shadows the builtin but is part of the
                       signature.)
        writeHeader -- forwarded unchanged to ReadTransaction and to
                       ValueResearchStatement.

    Returns:
        None.
    '''
    # ReadTransaction yields exactly four values; only the second one (the
    # statement) is needed here.
    _transactions, statement, _nonUpdatedExporter, _simpleExporter = ReadTransaction(set, writeHeader)

    valueResearchStatement = ValueResearchStatement(statement, writeHeader)
    Utility.WriteStatement(valueResearchStatement, set.outputVRFormat)