Beispiel #1
0
def main(BgShelve, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1, FgFile,
         FgGeneIDCol, FgTermCol, FgStartRow1, recomputeBg):
    """Compute term enrichment for a foreground set and print a report.

    Background term data is read from the "BG" entry of the shelve file
    ``BgShelve`` unless ``recomputeBg`` is true or the stored entry is
    missing, empty, or lacks the "__$$$NGENES" key.  In those cases it is
    rebuilt by ``calTerms`` from ``BgFile`` and written back to the shelve.
    Foreground data is always built by ``calTerms`` from ``FgFile``.

    For every foreground key that does not contain "__$$$", a p-value is
    requested from ``pvalue_enrichment(pop, popt, sam, samt)``, where
    ``pop``/``sam`` are the background/foreground "__$$$NGENES" values and
    ``popt``/``samt`` are the lengths of the term's background/foreground
    entries.  Rows are written to stdout in ascending p-value order with
    ``FDR = pvalue * nTermsFG / nInc``, ``nInc`` being the number of terms
    whose p-value is less than or equal to the current one.  Progress and
    error messages go to stderr.

    Args:
        BgShelve: Filename handed to ``shelve.open`` for the cached
            background data.
        BgFile, BgGeneIDCol, BgTermCol, BgStartRow1: Forwarded unchanged to
            ``calTerms`` when the background is rebuilt (presumably the
            input file, its gene-ID and term columns and the first data
            row -- confirm against ``calTerms``).
        FgFile, FgGeneIDCol, FgTermCol, FgStartRow1: Same as above, for the
            foreground data.
        recomputeBg: When true, ignore any cached background and rebuild it.

    Returns:
        None.  The report is written to stdout.
    """

    def writeRow(fields):
        # Each field is followed by " \t" and the row ends with a newline;
        # this is the layout the former Python 2 ``print x, "\t",`` chain
        # produced, kept so that consumers of the report see no change.
        stdout.write("".join("%s \t" % (field,) for field in fields) + "\n")

    BGData = None

    if not recomputeBg:
        saved = shelve.open(BgShelve)
        try:
            if "BG" in saved:
                BGData = saved["BG"]
                if len(BGData) < 1 or "__$$$NGENES" not in BGData:
                    recomputeBg = True
            else:
                recomputeBg = True
        finally:
            # Release the shelve even if reading the entry fails.
            saved.close()

    if recomputeBg:
        stderr.write("recal BG\n")
        BGData = dict()
        calTerms(BGData, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1)
        saved = shelve.open(BgShelve)
        try:
            saved["BG"] = BGData
        finally:
            saved.close()

    FGData = dict()
    calTerms(FGData, FgFile, FgGeneIDCol, FgTermCol, FgStartRow1)

    # Now calculate enrichment p-values, grouped by p-value for the FDR step.
    nGenesBG = BGData["__$$$NGENES"]
    nTermsFG = FGData["__$$$NTERMS"]
    nGenesFG = FGData["__$$$NGENES"]

    pvalueTermMap = dict()

    for term in FGData:
        # Keys containing "__$$$" hold bookkeeping counters, not terms.
        if "__$$$" in term:
            continue

        if term not in BGData:
            stderr.write("Error: term %s not found in background\n" % (term,))
            # Skip the term: there is no background entry to compare with,
            # and indexing BGData here would abort the run with KeyError.
            continue

        FGTermEntry = FGData[term]

        pop = nGenesBG
        popt = len(BGData[term])
        sam = nGenesFG
        samt = len(FGTermEntry)

        pvalue = pvalue_enrichment(pop, popt, sam, samt)

        resultVector = [term, pop, popt, sam, samt, FGTermEntry]
        pvalueTermMap.setdefault(pvalue, []).append(resultVector)
        stderr.write("%s ,pvalue= %s\n" % (resultVector, pvalue))

    writeRow(("FDR", "p-value", "term", "popt", "pop", "popt/pop",
              "samt", "sam", "samt/sam", "genes"))

    # Walk the p-values in ascending order; nInc is the running count of
    # terms seen so far, so tied terms share one FDR value.
    # NOTE(review): the FDR is neither capped at 1 nor made monotonic here.
    nInc = 0
    for pvalue in sorted(pvalueTermMap):
        termsWithThisPvalue = pvalueTermMap[pvalue]
        nInc += len(termsWithThisPvalue)

        FDR = (pvalue * nTermsFG) / nInc

        for term, pop, popt, sam, samt, FGTermEntry in termsWithThisPvalue:
            writeRow((FDR, pvalue, term, popt, pop, float(popt) / pop,
                      samt, sam, float(samt) / sam, ",".join(FGTermEntry)))
def main(BgShelve, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1, FgFile, FgGeneIDCol, FgTermCol, FgStartRow1, recomputeBg):
    """Run the enrichment analysis and write a tab-separated report to stdout.

    The background data comes from the "BG" entry of the shelve file
    ``BgShelve`` when ``recomputeBg`` is false and that entry is usable;
    otherwise ``calTerms`` rebuilds it from ``BgFile`` and the result is
    stored back under "BG".  The foreground data is always built by
    ``calTerms`` from ``FgFile``.

    Each foreground key without the "__$$$" marker gets a p-value from
    ``pvalue_enrichment(pop, popt, sam, samt)``: ``pop`` and ``sam`` are the
    "__$$$NGENES" values of background and foreground, ``popt`` and ``samt``
    the lengths of the term's entries.  Rows are printed by ascending
    p-value together with ``pvalue * nTermsFG / nInc`` as the FDR column,
    where ``nInc`` counts the terms with a p-value up to the current one.

    Args:
        BgShelve: Filename given to ``shelve.open`` for the background cache.
        BgFile, BgGeneIDCol, BgTermCol, BgStartRow1: Passed through to
            ``calTerms`` for the background (meaning defined by ``calTerms``;
            presumably file, column indices and first data row -- confirm).
        FgFile, FgGeneIDCol, FgTermCol, FgStartRow1: Passed through to
            ``calTerms`` for the foreground.
        recomputeBg: True to rebuild the background even if a cache exists.

    Returns:
        None.
    """
    BGData = None
    if not recomputeBg:
        BGData = _loadCachedBackground(BgShelve)

    if BGData is None:
        stderr.write("recal BG\n")
        BGData = dict()
        calTerms(BGData, BgFile, BgGeneIDCol, BgTermCol, BgStartRow1)
        _storeBackground(BgShelve, BGData)

    FGData = dict()
    calTerms(FGData, FgFile, FgGeneIDCol, FgTermCol, FgStartRow1)

    nGenesBG = BGData["__$$$NGENES"]
    nTermsFG = FGData["__$$$NTERMS"]
    nGenesFG = FGData["__$$$NGENES"]

    # p-value -> list of result vectors sharing that p-value.
    pvalueTermMap = dict()

    for term in FGData:
        # Entries whose key contains "__$$$" are counters, not terms.
        if "__$$$" in term:
            continue

        if term not in BGData:
            stderr.write("Error: term %s not found in background\n" % (term,))
            # Without this the lookup below raised KeyError right after
            # the message and no report was produced at all.
            continue

        FGTermEntry = FGData[term]
        popt = len(BGData[term])
        samt = len(FGTermEntry)

        pvalue = pvalue_enrichment(nGenesBG, popt, nGenesFG, samt)

        resultVector = [term, nGenesBG, popt, nGenesFG, samt, FGTermEntry]
        pvalueTermMap.setdefault(pvalue, []).append(resultVector)
        stderr.write("%s ,pvalue= %s\n" % (resultVector, pvalue))

    _writeReportRow(["FDR", "p-value", "term", "popt", "pop", "popt/pop",
                     "samt", "sam", "samt/sam", "genes"])

    # Ascending p-values; terms with equal p-values share one FDR value
    # because nInc is advanced by the whole group before it is used.
    nInc = 0
    for pvalue in sorted(pvalueTermMap):
        group = pvalueTermMap[pvalue]
        nInc += len(group)
        FDR = (pvalue * nTermsFG) / nInc

        for term, pop, popt, sam, samt, FGTermEntry in group:
            _writeReportRow([FDR, pvalue, term, popt, pop,
                             float(popt) / pop, samt, sam,
                             float(samt) / sam, ",".join(FGTermEntry)])


def _loadCachedBackground(BgShelve):
    """Return the usable "BG" entry of the shelve, or None if there is none."""
    saved = shelve.open(BgShelve)
    try:
        if "BG" not in saved:
            return None
        cached = saved["BG"]
    finally:
        # Close the shelve on every path, including read errors.
        saved.close()

    if len(cached) < 1 or "__$$$NGENES" not in cached:
        return None
    return cached


def _storeBackground(BgShelve, BGData):
    """Store BGData under the "BG" key of the shelve file."""
    saved = shelve.open(BgShelve)
    try:
        saved["BG"] = BGData
    finally:
        saved.close()


def _writeReportRow(fields):
    """Write one report row to stdout, each field followed by " \\t"."""
    # Same layout as the former Python 2 ``print x, "\t",`` statements.
    stdout.write("".join("%s \t" % (field,) for field in fields) + "\n")
#!/usr/bin/python
from hypergeom import pvalue_enrichment;

# Call pvalue_enrichment 999 times with the same fixed arguments and print
# the last result (presumably a crude timing/smoke run -- confirm intent).
for _ in range(1, 1000):
    r = pvalue_enrichment(19507150, 11324, 22457, 17)

# print(r) with a single argument prints the same text on Python 2 and 3.
print(r)