def gene_fdr_correction(allgenedict, method):
    """
    Perform p value correction.

    For each of the six p-value attributes listed below, the values of all
    genes are stacked into a (genes x entries) table, ``pFDR`` is applied
    across genes separately for every entry position, and the corrected
    values are written back to each gene instance under the attribute
    ``<p-value attribute>_fdr``.

    Parameters:
        allgenedict
            {gene_id: gene instance}. Every instance is read through
            ``getattr`` for each p-value attribute; the attributes are
            presumably equal-length lists of floats (one entry per
            coefficient) -- confirm against the caller.
        method
            Correction method, forwarded unchanged to
            ``mageck.fdr_calculation.pFDR``.

    Returns:
        None. The gene instances in ``allgenedict`` are modified in place.
    """
    # Nothing to correct (and nothing to import) when there are no genes.
    if not allgenedict:
        return

    import numpy as np
    from mageck.fdr_calculation import pFDR

    pvaluelabel_list = [
        'beta_pval',
        'beta_pval_neg',
        'beta_pval_pos',
        'beta_permute_pval',
        'beta_permute_pval_neg',
        'beta_permute_pval_pos',
    ]

    # Snapshot the instances once so that the read pass and the write pass
    # below are guaranteed to walk the genes in the same order.
    # .values() works on both Python 2 and 3 (iteritems() is Python 2 only).
    gene_instances = list(allgenedict.values())

    for whichp in pvaluelabel_list:
        writep = whichp + '_fdr'

        # rows = genes, columns = entries of the per-gene p-value vector;
        # ndmin=2 keeps the 2-D shape the former np.matrix guaranteed.
        pvaluemat = np.array(
            [getattr(ginst, whichp) for ginst in gene_instances], ndmin=2)
        # transposed view: rows = entries, columns = genes
        pvaluemat_t = pvaluemat.T

        # correct each entry position across all genes, row by row
        for cid in range(pvaluemat_t.shape[0]):
            p_vec = pvaluemat_t[cid, :].tolist()
            pvaluemat_t[cid, :] = np.array(pFDR(p_vec, method))

        # set up the attribute: column gid holds the values of gene gid
        for gid, ginst in enumerate(gene_instances):
            setattr(ginst, writep, pvaluemat_t[:, gid].tolist())
def merge_rank_files(lowfile, highfile, outfile, args):
    """
    Merge neg. and pos. selected files (generated by RRA) into one

    Both input files are whitespace-delimited with one header line followed
    by one record per line: id, num, score, p-value, fdr, goodsgrna.  The
    rank of a record is its 1-based position among the data lines.

    Parameters:
        lowfile
            RRA neg. selection output
        highfile
            RRA pos. selection output
        outfile
            The output file name (tab-delimited, 12 columns)
        args
            Optional attributes that are honoured when present:
            ``sort_criteria`` ('pos' sorts by the pos. score, anything else
            by the neg. score) and ``adjust_method`` (when it is not 'fdr',
            both fdr columns are recomputed from the p-values with
            ``mageck.fdr_calculation.pFDR``).

    An item that occurs in only one of the two files gets a placeholder
    record (score/p-value/fdr = 1.0, rank = number of lines of lowfile,
    goodsgrna = 0) for the missing side.  A data line with fewer than 6
    fields is logged as an error and terminates the program with
    ``sys.exit(-1)``.
    """

    def read_rank_file(path):
        """Parse one RRA file; return (records, number of lines read)."""
        records = []
        nline = 0
        # 'with' closes the handle even if parsing fails half-way
        with open(path) as fhd:
            for line in fhd:
                field = line.strip().split()
                nline += 1
                if nline == 1:  # skip the first line
                    continue
                # field[5] is read below, so at least 6 fields are required
                if len(field) < 6:
                    logging.error('The number of fields in file ' + path + ' is <6.')
                    sys.exit(-1)
                records.append((
                    field[0],         # gid
                    int(field[1]),    # gitem
                    float(field[2]),  # g_lo
                    float(field[3]),  # g_p
                    float(field[4]),  # g_fdr
                    nline - 1,        # rank
                    int(field[5]),    # g_goodsgrna
                ))
        return records, nline

    # gfile[gid] = [neg_record, pos_record]
    #   neg_record = [gitem, score, p-value, fdr, rank, goodsgrna]
    #   pos_record = [score, p-value, fdr, rank, goodsgrna]
    gfile = {}

    # read files individually
    low_records, maxnline = read_rank_file(lowfile)
    for (gid, gitem, g_lo, g_p, g_fdr, rank, g_goodsgrna) in low_records:
        gfile[gid] = [[gitem, g_lo, g_p, g_fdr, rank, g_goodsgrna]]

    high_records, _ = read_rank_file(highfile)
    for (gid, gitem, g_lo, g_p, g_fdr, rank, g_goodsgrna) in high_records:
        if gid not in gfile:
            logging.warning('Item ' + gid + ' appears in ' + highfile +
                            ', but not in ' + lowfile + '.')
            # The neg. placeholder must carry gitem as well: every neg.
            # record needs 6 fields, otherwise the output columns and the
            # p-value/fdr indices used below are shifted by one.
            gfile[gid] = [[gitem, 1.0, 1.0, 1.0, maxnline, 0]]
        elif gfile[gid][0][0] != gitem:
            logging.warning('Item number of ' + gid +
                            ' does not match previous file: ' + str(gitem) +
                            ' !=' + str(gfile[gid][0][0]) + '.')
        gfile[gid].append([g_lo, g_p, g_fdr, rank, g_goodsgrna])  # don't repeat the gitem

    # check whether some items appear in the first group, but not in the second group
    for (gid, records) in gfile.items():
        if len(records) == 1:
            logging.warning('Item ' + gid + ' appears in ' + lowfile +
                            ', but not in ' + highfile + '.')
            records.append([1.0, 1.0, 1.0, maxnline, 0])

    if hasattr(args, 'sort_criteria') and args.sort_criteria == 'pos':
        logging.debug('Sorting the merged items by positive selection...')
        skey = sorted(gfile.items(), key=lambda x: x[1][1][0])
    else:
        logging.debug('Sorting the merged items by negative selection...')
        skey = sorted(gfile.items(), key=lambda x: x[1][0][1])

    # correct FDR method from RRA
    if hasattr(args, 'adjust_method') and args.adjust_method != 'fdr':
        from mageck.fdr_calculation import pFDR
        logging.debug('adjusting fdr using ' + args.adjust_method + ' method ...')
        # negative selection: p-value is in item[2], fdr in item[3]
        pnegpool = [records[0][2] for (_, records) in skey]
        # positive selection: p-value is in item[1], fdr in item[2]
        ppospool = [records[1][1] for (_, records) in skey]
        dfrnegpool = pFDR(pnegpool, method=args.adjust_method)
        dfrpospool = pFDR(ppospool, method=args.adjust_method)
        for ind, (_, records) in enumerate(skey):
            records[0][3] = dfrnegpool[ind]
            records[1][2] = dfrpospool[ind]

    # write to file
    with open(outfile, 'w') as ofhd:
        print('\t'.join([
            'id', 'num', 'neg|score', 'neg|p-value', 'neg|fdr', 'neg|rank',
            'neg|goodsgrna', 'pos|score', 'pos|p-value', 'pos|fdr',
            'pos|rank', 'pos|goodsgrna',
        ]), file=ofhd)
        for (gid, records) in skey:
            print('\t'.join([gid] + [str(t) for t in records[0] + records[1]]),
                  file=ofhd)
def merge_rank_files(lowfile, highfile, outfile, args, cutoffinfo):
    """
    Merge neg. and pos. selected files (generated by RRA) into one

    Both input files are whitespace-delimited with one header line followed
    by one record per line: id, num, score, p-value, fdr, goodsgrna.  The
    rank of a record is its 1-based position among the data lines.  An fdr
    field that cannot be parsed as a float is stored as "NA" and the record
    is flagged ``isbad``; such records are left out of the FDR adjustment.

    Parameters:
        lowfile
            RRA neg. selection output
        highfile
            RRA pos. selection output
        outfile
            The output file name (tab-delimited, 14 columns)
        args
            Optional attributes that are honoured when present:
            ``sort_criteria`` ('pos' sorts by the pos. rank, anything else
            by the neg. rank) and ``adjust_method`` (when it is not 'fdr',
            the fdr of every non-bad record is recomputed from the p-values
            with ``mageck.fdr_calculation.pFDR``).
        cutoffinfo
            The return value of crispr_test. Include (low_p_threshold,
            high_p_threshold, lower_gene_lfc, higher_gene_lfc), where
            lower_gene_lfc={gene:lfc} is the log fold change of sgRNAs.
            Only items [2] and [3] are read here.

    An item that occurs in only one of the two files gets a placeholder
    ``Rank_Obj`` for the missing side whose rank is the number of lines of
    lowfile; its remaining fields keep whatever defaults ``Rank_Obj``
    defines.  A data line with fewer than 6 fields is logged as an error
    and terminates the program with ``sys.exit(-1)``.
    """

    def read_rank_file(path, gene_lfc):
        """Parse one RRA file; return (list of Rank_Obj, number of lines read)."""
        records = []
        nline = 0
        # 'with' closes the handle even if parsing fails half-way
        with open(path) as fhd:
            for line in fhd:
                field = line.strip().split()
                nline += 1
                if nline == 1:  # skip the first line
                    continue
                # field[5] is read below, so at least 6 fields are required
                if len(field) < 6:
                    logging.error("The number of fields in file " + path + " is <6.")
                    sys.exit(-1)
                r_o = Rank_Obj()
                r_o.name = field[0]
                r_o.sgrna = int(field[1])
                r_o.lo = float(field[2])
                r_o.pval = float(field[3])
                r_o.rank = nline - 1
                try:
                    r_o.fdr = float(field[4])
                except ValueError:
                    # unparsable fdr: keep the record but exclude it from
                    # the FDR adjustment further down
                    r_o.fdr = "NA"
                    r_o.isbad = True
                r_o.goodsgrna = int(field[5])
                if r_o.name in gene_lfc:
                    r_o.lfc = "{:.5g}".format(gene_lfc[r_o.name])
                else:
                    r_o.lfc = 0.0
                records.append(r_o)
        return records, nline

    lower_gene_lfc = cutoffinfo[2]
    higher_gene_lfc = cutoffinfo[3]

    # gfile[name] = [neg. selection Rank_Obj, pos. selection Rank_Obj]
    gfile = {}

    # read files individually
    low_records, maxnline = read_rank_file(lowfile, lower_gene_lfc)
    for r_o in low_records:
        gfile[r_o.name] = [r_o]

    high_records, _ = read_rank_file(highfile, higher_gene_lfc)
    for r_o in high_records:
        if r_o.name not in gfile:
            logging.warning("Item " + r_o.name + " appears in " + highfile +
                            ", but not in " + lowfile + ".")
            placeholder = Rank_Obj()
            # The output row takes its id and sgRNA count from the neg.
            # object, so the placeholder has to carry them; otherwise the
            # row would be written with the Rank_Obj default name.
            placeholder.name = r_o.name
            placeholder.sgrna = r_o.sgrna
            placeholder.rank = maxnline
            gfile[r_o.name] = [placeholder]
        elif gfile[r_o.name][0].sgrna != r_o.sgrna:
            logging.warning("Item number of " + r_o.name +
                            " does not match previous file: " + str(r_o.sgrna) +
                            " !=" + str(gfile[r_o.name][0].sgrna) + ".")
        gfile[r_o.name].append(r_o)

    # check whether some items appear in the first group, but not in the second group
    for (name, records) in gfile.items():
        if len(records) == 1:
            logging.warning("Item " + name + " appears in " + lowfile +
                            ", but not in " + highfile + ".")
            placeholder = Rank_Obj()
            placeholder.rank = maxnline
            records.append(placeholder)

    if hasattr(args, "sort_criteria") and args.sort_criteria == "pos":
        logging.debug("Sorting the merged items by positive selection...")
        skey = sorted(gfile.items(), key=lambda x: x[1][1].rank)
    else:
        logging.debug("Sorting the merged items by negative selection...")
        skey = sorted(gfile.items(), key=lambda x: x[1][0].rank)

    # correct FDR method from RRA
    if hasattr(args, "adjust_method") and args.adjust_method != "fdr":
        from mageck.fdr_calculation import pFDR

        logging.debug("adjusting fdr using " + args.adjust_method + " method ...")
        # Only records with a usable fdr take part in the adjustment.
        # NOTE(review): assumes Rank_Obj.isbad defaults to a false boolean.
        pnegpool = [records[0].pval for (_, records) in skey if not records[0].isbad]
        ppospool = [records[1].pval for (_, records) in skey if not records[1].isbad]
        dfrnegpool = pFDR(pnegpool, method=args.adjust_method)
        dfrpospool = pFDR(ppospool, method=args.adjust_method)
        # side 0 = neg. selection, side 1 = pos. selection; the adjusted
        # values come back in the same order the pools were built in
        for side, adjusted in ((0, dfrnegpool), (1, dfrpospool)):
            ind = 0
            for (_, records) in skey:
                if records[side].isbad:
                    records[side].fdr = "NA"
                else:
                    records[side].fdr = adjusted[ind]
                    ind += 1

    # write to file
    with open(outfile, "w") as ofhd:
        print(
            "\t".join([
                "id", "num",
                "neg|score", "neg|p-value", "neg|fdr", "neg|rank",
                "neg|goodsgrna", "neg|lfc",
                "pos|score", "pos|p-value", "pos|fdr", "pos|rank",
                "pos|goodsgrna", "pos|lfc",
            ]),
            file=ofhd,
        )
        for (_, records) in skey:
            negobj = records[0]
            posobj = records[1]
            columns = [
                negobj.name, negobj.sgrna,
                negobj.lo, negobj.pval, negobj.fdr, negobj.rank,
                negobj.goodsgrna, negobj.lfc,
                posobj.lo, posobj.pval, posobj.fdr, posobj.rank,
                posobj.goodsgrna, posobj.lfc,
            ]
            print("\t".join([str(x) for x in columns]), file=ofhd)
def merge_rank_files(lowfile, highfile, outfile, args):
    """
    Merge neg. and pos. selected files (generated by RRA) into one

    Both input files are whitespace-delimited with one header line followed
    by one record per line: id, num, score, p-value, fdr, goodsgrna.  The
    rank of a record is its 1-based position among the data lines.

    Parameters:
        lowfile
            RRA neg. selection output
        highfile
            RRA pos. selection output
        outfile
            The output file name (tab-delimited, 12 columns)
        args
            Optional attributes that are honoured when present:
            ``sort_criteria`` ('pos' sorts by the pos. score, anything else
            by the neg. score) and ``adjust_method`` (when it is not 'fdr',
            both fdr columns are recomputed from the p-values with
            ``mageck.fdr_calculation.pFDR``).

    An item that occurs in only one of the two files gets a placeholder
    record (score/p-value/fdr = 1.0, rank = number of lines of lowfile,
    goodsgrna = 0) for the missing side.  A data line with fewer than 6
    fields is logged as an error and terminates the program with
    ``sys.exit(-1)``.
    """

    def read_rank_file(path):
        """Parse one RRA file; return (records, number of lines read)."""
        records = []
        nline = 0
        # 'with' closes the handle even if parsing fails half-way
        with open(path) as fhd:
            for line in fhd:
                field = line.strip().split()
                nline += 1
                if nline == 1:  # skip the first line
                    continue
                # field[5] is read below, so at least 6 fields are required
                if len(field) < 6:
                    logging.error('The number of fields in file ' + path + ' is <6.')
                    sys.exit(-1)
                records.append((
                    field[0],         # gid
                    int(field[1]),    # gitem
                    float(field[2]),  # g_lo
                    float(field[3]),  # g_p
                    float(field[4]),  # g_fdr
                    nline - 1,        # rank
                    int(field[5]),    # g_goodsgrna
                ))
        return records, nline

    # gfile[gid] = [neg_record, pos_record]
    #   neg_record = [gitem, score, p-value, fdr, rank, goodsgrna]
    #   pos_record = [score, p-value, fdr, rank, goodsgrna]
    gfile = {}

    # read files individually
    low_records, maxnline = read_rank_file(lowfile)
    for (gid, gitem, g_lo, g_p, g_fdr, rank, g_goodsgrna) in low_records:
        gfile[gid] = [[gitem, g_lo, g_p, g_fdr, rank, g_goodsgrna]]

    high_records, _ = read_rank_file(highfile)
    for (gid, gitem, g_lo, g_p, g_fdr, rank, g_goodsgrna) in high_records:
        if gid not in gfile:
            logging.warning('Item ' + gid + ' appears in ' + highfile +
                            ', but not in ' + lowfile + '.')
            # The neg. placeholder must carry gitem as well: every neg.
            # record needs 6 fields, otherwise the output columns and the
            # p-value/fdr indices used below are shifted by one.
            gfile[gid] = [[gitem, 1.0, 1.0, 1.0, maxnline, 0]]
        elif gfile[gid][0][0] != gitem:
            logging.warning('Item number of ' + gid +
                            ' does not match previous file: ' + str(gitem) +
                            ' !=' + str(gfile[gid][0][0]) + '.')
        gfile[gid].append([g_lo, g_p, g_fdr, rank, g_goodsgrna])  # don't repeat the gitem

    # check whether some items appear in the first group, but not in the second group
    for (gid, records) in gfile.items():
        if len(records) == 1:
            logging.warning('Item ' + gid + ' appears in ' + lowfile +
                            ', but not in ' + highfile + '.')
            records.append([1.0, 1.0, 1.0, maxnline, 0])

    if hasattr(args, 'sort_criteria') and args.sort_criteria == 'pos':
        logging.debug('Sorting the merged items by positive selection...')
        skey = sorted(gfile.items(), key=lambda x: x[1][1][0])
    else:
        logging.debug('Sorting the merged items by negative selection...')
        skey = sorted(gfile.items(), key=lambda x: x[1][0][1])

    # correct FDR method from RRA
    if hasattr(args, 'adjust_method') and args.adjust_method != 'fdr':
        from mageck.fdr_calculation import pFDR
        logging.debug('adjusting fdr using ' + args.adjust_method + ' method ...')
        # negative selection: p-value is in item[2], fdr in item[3]
        pnegpool = [records[0][2] for (_, records) in skey]
        # positive selection: p-value is in item[1], fdr in item[2]
        ppospool = [records[1][1] for (_, records) in skey]
        dfrnegpool = pFDR(pnegpool, method=args.adjust_method)
        dfrpospool = pFDR(ppospool, method=args.adjust_method)
        for ind, (_, records) in enumerate(skey):
            records[0][3] = dfrnegpool[ind]
            records[1][2] = dfrpospool[ind]

    # write to file
    with open(outfile, 'w') as ofhd:
        print('\t'.join([
            'id', 'num', 'neg|score', 'neg|p-value', 'neg|fdr', 'neg|rank',
            'neg|goodsgrna', 'pos|score', 'pos|p-value', 'pos|fdr',
            'pos|rank', 'pos|goodsgrna',
        ]), file=ofhd)
        for (gid, records) in skey:
            print('\t'.join([gid] + [str(t) for t in records[0] + records[1]]),
                  file=ofhd)