def goClassEnrichment(geneCluster,geneClusterName,goClassDict,popSize=None,FDRthresh=0.05):
    """
    goClassEnrichment(geneCluster,geneClusterName,goClassDict,popSize,FDRthresh=0.05):

    Score one gene cluster against every GO term in goClassDict and attach
    Benjamini-Hochberg adjusted p-values to the raw ones.

    geneCluster     = set(genesGroupedBySomeQuality)
    geneClusterName = unique ID of the cluster; copied verbatim into each row
    goClassDict     = dict(keys=each GO term in bp, cc, or mf class,
                           vals=set(genesAttachedToGOterm))
    popSize         = int(numberOfGenesConsideredAsPopulation); passed
                      straight through to goEnrichment()
    FDRthresh       = only recorded in the output rows; nothing is filtered
                      against it in this function

    Returns a list with one row per GO term, in the order benjHochFDR()
    hands the terms back:
    [GOterm, GOTermSize, geneSetName, geneClstSize, pVal, BHpVal,
     FDRthresh, numMatchingGenes, matchingGenes]
    Every field except GOterm and geneSetName is rendered as a string.
    """
    # One [term, raw p-value] pair per GO term.
    termPvals = [[term, goEnrichment(geneCluster, goClassDict[term], popSize=popSize)]
                 for term in goClassDict]

    # benjHochFDR is expected to give back rows of [term, rawP, adjustedP].
    corrected = benjHochFDR(termPvals)

    def buildRow(fdrRow):
        # Expand one corrected row into the full, stringified report row.
        term = fdrRow[0]
        rawP = fdrRow[1]
        adjP = fdrRow[2]
        termGenes = goClassDict[term]
        shared = termGenes.intersection(geneCluster)
        return [term,
                str(len(termGenes)),
                geneClusterName,
                str(len(geneCluster)),
                '%.5g' % rawP,
                '%.5g' % adjP,
                str(FDRthresh),
                str(len(shared)),
                str(sorted(shared))]

    return [buildRow(fdrRow) for fdrRow in corrected]
import sys

# Script: for every qualifying row of a tab-separated integer table, compute a
# cumulative binomial p-value, Benjamini-Hochberg adjust the p-values, and
# write the rows (with both values appended) to '<inFile>.pr<prob>.qVals'.
#
# NOTE(review): `binom` and `benjHochFDR` are used below but are not imported
# anywhere in the visible part of this script -- presumably scipy.stats.binom
# and a project-local Benjamini-Hochberg helper; confirm they are in scope.

usage = '\n\n\nUSAGE: python rateSNPs.py inFile prob'
if len(sys.argv) != 3:
    # Parenthesised form prints the same text on Python 2 and parses on 3.
    print(usage)
    # sys.exit instead of the site-injected exit(), which is meant for the
    # interactive shell and is missing when the site module is not loaded.
    sys.exit(1)

inFile = sys.argv[1]
prob = float(sys.argv[2])

varPos = []
# Calculate and record >= x pVals
# 'with' guarantees the input handle is closed even if a row fails to parse.
with open(inFile, 'rU') as fIn:
    for line in fIn:
        # A blank line (e.g. a trailing newline at EOF) has no columns to
        # parse; skip it instead of crashing in int('').
        if not line.strip():
            continue
        fields = [int(x) for x in line.strip('\n').split('\t')]
        # Column 1 is used as n (number of trials) and column 2 as x (the
        # mismatch count); only rows with n >= 5 and x > 0 are scored.
        if (fields[2] > 0) and (fields[1] >= 5):
            # I think that we want the cumulative p-val for x or GREATER
            # mismatches, so we use 1 - binom.cdf(x-1, n, prob)
            # <-- need to confirm this.
            # *** Harsha suggests x or LESS. I was using that until I could
            #     ask XX.
            # *** Tried it Harsha's way and things do NOT look right:
            #     546403 29 29 0 1.0 1.0
            # NOTE(review): 1 - cdf loses precision for very small tails; if
            # binom is scipy.stats.binom, binom.sf(x-1, n, prob) is the same
            # quantity computed directly -- consider switching after review.
            cumP = 1 - binom.cdf(fields[2] - 1, fields[1], prob)
            varPos.append(fields + [cumP])

# Calculate BH adjusted q-vals
# pValColumn=4 points at the cumP value appended above -- assumes each input
# row has exactly four integer columns; TODO confirm against the input format.
varPos = benjHochFDR(varPos, pValColumn=4)

# 'with' flushes and closes the output file deterministically; the original
# never closed it.
with open('%s.pr%.5f.qVals' % (inFile, prob), 'w') as fOut:
    for item in varPos:
        fOut.write('%s\n' % ('\t'.join([str(x) for x in item])))
['H30',0.17969873276261300000], ['H31',0.06671269389523740000], ['H32',0.52378879019888900000], ['H33',0.09512201528646370000], ['H34',0.78718046267139600000], ['H35',0.56462313820509200000], ['H36',0.00288596697049005000], ['H37',0.02627986555212090000], ['H38',0.00206440762381543000], ['H39',0.09935440767001180000], ['H40',0.19816649844661500000], ['H41',0.22098313443918600000], ['H42',0.10902751849802400000], ['H43',0.96855523882802700000], ['H44',0.00074450579752643400], ['H45',0.75891146728055300000], ['H46',0.03689279063906480000], ['H47',0.00000657187928204422], ['H48',0.84144926992132700000], ['H49',0.00566326392627456000], ['H50',0.90896231228331500000], ['H51',0.00007291688192959810], ['H52',0.13261734077759300000], ['H53',0.71829596327866800000]] rVals = benjHoch(ps,) nrVals = statsDefs.benjHochFDR(ps,pValColumn=1) test = rVals == nrVals None