def collectMiRNA_totals(pathToFile):
    """\tRead a miRNA totals file and collect, per miRNA, the AGAP gene names
    listed on its "allPassedSeedsFor_<N>" lines.

    Lines are grouped on field 0 (split on ' : ') by groupByField_silent();
    each group is treated as one miRNA whose name is field 0 of its first line.
    For every line whose field 1 starts with "allPassedSeedsFor_", the final
    character of field 1 is taken as the integer orthoType, and the last field
    is evaluated as an iterable of gene-name groups.

    RETURNS: a dict {miRNA name : Bag} where each Bag has
             .name       -> the miRNA name
             .orthoTypes -> list of orthoTypes seen, in file order
             .AGAPs      -> list (built by initList(4,set())) indexed by
                            orthoType, each entry holding the gene names that
                            start with 'AGAP'
    """
    # Read inside a 'with' block so the file handle is always closed, even if
    # reading raises.  ('rU' is the Python 2 universal-newline mode.)
    with open(pathToFile,'rU') as inFile:
        lines = inFile.readlines()

    groupedLines = groupByField_silent(lines,0,sep=' : ')
    groupedLines.sort(key=lambda x: x[0][0])

    rDict = {}
    for miR in groupedLines:
        data = Bag({'name':miR[0][0],
                    'orthoTypes':[],
                    'AGAPs':initList(4,set())})
        for line in miR:
            if not line[1].startswith("allPassedSeedsFor_"):
                continue
            orthoType = int(line[1][-1])
            data.orthoTypes.append(orthoType)
            # SECURITY NOTE: eval() executes whatever expression is stored in
            # the file.  Only run this on totals files you generated yourself;
            # if the field is always a plain literal, ast.literal_eval() would
            # be the safe replacement (TODO confirm the on-disk format).
            agaps = [gene
                     for group in eval(line[-1])
                     for gene in group
                     if gene.startswith('AGAP')]
            data.AGAPs[orthoType].update(agaps)
        rDict[data.name] = data
    return rDict
def _genesTaggedWith(goTerm,genes,anoXdict):
    """\tReturn the set of names in genes that have at least one anoXdict
    record whose second-to-last field equals goTerm."""
    tagged = set()
    for gene in genes:
        try:
            records = anoXdict[gene]
        except KeyError:
            # Gene has no entry in the annotation dict, so it cannot carry
            # the GO term: skip it.
            continue
        if any(rec[-2] == goTerm for rec in records):
            tagged.add(gene)
    return tagged


def collectGeneNames(goTerm,masterGenes,anoXdict):
    """\tFor a GO term, query anoXdict for which genes in masterGenes
    (real & ctrls) are tagged with it.

    goTerm      : value compared against field [-2] of each annotation record.
    masterGenes : [ iterable(realGenes) , [iterables(ctrlGenes)] ]
    anoXdict    : mapping {gene name : list of annotation records}.  Genes
                  missing from it are silently skipped.

    The annotation data is read from the anoXdict argument (not from a module
    global), and one control set is returned per entry of masterGenes[1], so
    the function depends only on its arguments.

    RETURNS: a list([ set(realGenes) , [sets(ctrlGenes)] ])
    """
    # -- Collect Reals --
    realGenes = _genesTaggedWith(goTerm,masterGenes[0],anoXdict)
    # -- Collect Ctrls --
    ctrlGenes = [_genesTaggedWith(goTerm,ctrlSet,anoXdict)
                 for ctrlSet in masterGenes[1]]
    # Return
    return [realGenes,ctrlGenes]
else: anoXdata[row[0][:-3]] = [[row[0]]+row[strt:stp]] goInfo[row[strt:stp][-2]] = row[strt:stp] # Create a library of goTermInfo # -- Good miRNA A1-m8 list -- print "processing Good miRNA A1-m8 list..." A1_m8s = sorted(map(lambda l: l.strip('\n'), open(A1_m8_file,'rU'))) # -- Process Events Pickle -- print "processing Events Pickle..." data_Ca = cPickle.load(open(pklPath_Ca,'rU')) classConvert = {'II':2,'III':3} ctrlNum = len(data_Ca[data_Ca.keys()[0]].ctrlEvents['A1_to_m8']) masterGeneSets = [set(),initList(ctrlNum,set())] getMasterTargetList(useClass,A1_m8s,masterGeneSets, data_Ca) print 'deleting Events Pickle...' del(data_Ca) # free-up some memory to work with! # --------- Main Body --------- # -- Collect GO term Data -- print 'Collecting GO-term Data...'
def writeTargetsFdrMedMeth(miRobj_Ca,miRobj_Cb,oFile):
    """\tWrite one miRNA's passing target sets and FDR figures to oFile.

    miRobj_Ca : miRNA object screened against control set "a"; its
                reportGeneTargetsFdrMedMeth() result supplies the passing
                (seedType, orthoType) hits.
    miRobj_Cb : the same miRNA evaluated with control set "b"; its
                ctrlEvents[seedType][ctrlIndex][orthoType] collections are
                used to compute the combined FDR.
    oFile     : open, writable file-like object.  All output goes here.

    Output written:
      * a '-- <name> --' header,
      * one 'allPassedSeedsFor_<orthoType>' line per orthoType (1-3) that has
        any passing targets, giving the target count, the conservative FDR
        (median + stdvsAboveMed * stdDev) and the median FDR,
      * one line per passing (seedType, orthoType) hit from the Ctrl_a screen.

    Relies on the module-level names stdvsAboveMed, consFdrThreshold, miRT,
    mathDefs and initList.  RETURNS: None.
    """
    miRHits_Ca = miRobj_Ca.reportGeneTargetsFdrMedMeth(stdvLimit=stdvsAboveMed,
                                                       consFdrThresh=consFdrThreshold,
                                                       divide=0)

    # Index 0 is a placeholder so that orthoTypes 1-3 can be used as indexes.
    totReal = [None,set(),set(),set()]
    # One [None,set,set,set] accumulator per control group; the number of
    # groups is taken from the first seedType entry of miRobj_Ca.ctrlEvents.
    firstSeedType = list(miRobj_Ca.ctrlEvents.keys())[0]
    totCtrl = initList(len(miRobj_Ca.ctrlEvents[firstSeedType]),
                       [None,set(),set(),set()])

    # Calculate combined FDR for miRNA using Ctrl_b data from seedTypes that
    # passed the reportGeneTargetsFdrMedMeth() Ctrl_a screen.
    # >> Gather and combine data from passed seedTypes:
    for oType in range(1,4):
        for sType in miRT._seedModels:
            if not miRHits_Ca[sType][oType]:
                continue
            totReal[oType].update(miRHits_Ca[sType][oType][0])
            for i,ctrlTotals in enumerate(totCtrl):
                ctrlTotals[oType].update(miRobj_Cb.ctrlEvents[sType][i][oType])

    # >> Calculate separate FDRs for each Ctrl_b group:
    totalsData = [None,None,None,None]
    for oType in range(1,4):
        if not totReal[oType]:
            # No passing targets: nothing to report (and avoids dividing by 0).
            continue
        realVal = len(totReal[oType])
        # Each control group's FDR is ctrl/real, capped at 1.0.
        tempFDRs = [min(1.0,float(len(ctrlTotals[oType]))/realVal)
                    for ctrlTotals in totCtrl]
        oFDRstdv,oFDRmed = mathDefs.stdDv(tempFDRs,'median')
        cons_oFDR = oFDRmed + (stdvsAboveMed*oFDRstdv)
        totalsData[oType] = [totReal[oType],oFDRmed,cons_oFDR]

    # Write out Totals data:
    # (parenthesized single-argument print behaves the same under Python 2)
    print(miRobj_Cb.name)
    oFile.write('-- %s --\n' % (miRobj_Cb.name))
    for i in range(1,len(totalsData)):
        if totalsData[i]:
            oFile.write('%s : allPassedSeedsFor_%s : %s : %s : %s Seqs=%s\n' \
                        %(miRobj_Cb.name,
                          i,
                          len(totalsData[i][0]),
                          totalsData[i][2],
                          totalsData[i][1],
                          ','.join(sorted([str(x) for x in totalsData[i][0]]))))

    # write out passed Seed data
    for seedType in sorted(miRHits_Ca):
        for i in range(1,len(miRHits_Ca[seedType])):
            if miRHits_Ca[seedType][i]:
                oFile.write('%s : %s : orthoType_%s : %s : %.2f : %.4f Seqs=%s\n'\
                            %(miRobj_Ca.name,
                              seedType,
                              i,
                              len(miRHits_Ca[seedType][i][0]),
                              miRHits_Ca[seedType][i][2],
                              miRHits_Ca[seedType][i][1],
                              ','.join(sorted([str(x) for x in miRHits_Ca[seedType][i][0]]))))
    oFile.flush()