コード例 #1
0
def gene_fdr_correction(allgenedict, method):
  '''
  Perform multiple-testing correction on the p values stored on gene objects.

  For every p-value attribute listed in ``pvaluelabel_list`` the values of all
  gene objects are collected, corrected with ``pFDR`` across genes (one call
  per position of the per-gene p-value list), and written back to the same
  object under the attribute name with the suffix ``_fdr``.

  Parameters:
    allgenedict
        Dictionary whose values are the gene objects to update. Each object
        must provide every attribute named in ``pvaluelabel_list``.
        (presumably each attribute is a list with one p value per
        condition/column, all of the same length -- confirm against caller)
    method
        Correction method, passed unchanged as the second argument of
        ``mageck.fdr_calculation.pFDR``.

  Returns:
    None. The gene objects are modified in place.
  '''
  # Imported once per call instead of once per p-value label.
  import numpy as np
  from mageck.fdr_calculation import pFDR

  pvaluelabel_list = ['beta_pval', 'beta_pval_neg', 'beta_pval_pos',
                      'beta_permute_pval', 'beta_permute_pval_neg',
                      'beta_permute_pval_pos']
  # Freeze the gene order once so that reading and writing are guaranteed to
  # visit the objects in the same order. values() works on Python 2 and 3,
  # unlike iteritems().
  gene_list = list(allgenedict.values())
  for whichp in pvaluelabel_list:
    writep = whichp + '_fdr'
    # One row per gene; ndmin=2 keeps the matrix two-dimensional even for
    # degenerate input, matching what np.matrix produced before.
    pvaluemat = np.array([getattr(ginst, whichp) for ginst in gene_list],
                         ndmin=2)
    # After transposing, each row holds the values of all genes for one
    # position of the p-value list.
    pvaluemat_t = pvaluemat.T
    # Correct row by row, i.e. across genes.
    for cid in range(pvaluemat_t.shape[0]):
      p_vec = pvaluemat_t[cid, :].tolist()
      fdr_vec = pFDR(p_vec, method)
      pvaluemat_t[cid, :] = np.array(fdr_vec)
    # Column gid of the transposed matrix belongs to the gid-th gene.
    for gid, ginst in enumerate(gene_list):
      setattr(ginst, writep, pvaluemat_t[:, gid].tolist())
コード例 #2
0
def merge_rank_files(lowfile, highfile, outfile, args):
    """
    Merge neg. and pos. selected files (generated by RRA) into one.

    Both input files are whitespace-separated tables whose first line is a
    header. Every other line must provide at least six fields, read as:
    id, number of items, score, p value, FDR, number of good sgRNAs.

    Parameters:
      lowfile
          RRA negative selection output.
      highfile
          RRA positive selection output.
      outfile
          Name of the merged, tab-separated output file.
      args
          Object with optional attributes:
          ``sort_criteria`` -- 'pos' sorts the output by the positive
          selection score, anything else by the negative selection score;
          ``adjust_method`` -- if present and not 'fdr', the FDR columns are
          recomputed from the p values with ``mageck.fdr_calculation.pFDR``.

    Items found in only one of the files get a placeholder record for the
    other direction (score, p value and FDR 1.0, rank equal to the number of
    lines of lowfile, 0 good sgRNAs) and a warning is logged.

    Exits with status -1 (after logging an error) if a line has fewer than
    six fields.
    """
    # id, num, score, p-value, fdr, goodsgrna: all six are read below.
    n_required = 6

    def parse_fields(field, path):
        """Validate one split line and convert its first six fields."""
        if len(field) < n_required:
            logging.error('The number of fields in file ' + path + ' is <' +
                          str(n_required) + '.')
            sys.exit(-1)
        return (field[0], int(field[1]), float(field[2]), float(field[3]),
                float(field[4]), int(field[5]))

    # gfile maps id -> [neg_record, pos_record]
    #   neg_record = [num, score, p, fdr, rank, goodsgrna]
    #   pos_record = [score, p, fdr, rank, goodsgrna]  (num is not repeated)
    gfile = {}
    nline = 0
    with open(lowfile) as handle:
        for line in handle:
            nline += 1
            if nline == 1:  # skip the header line
                continue
            gid, gitem, g_lo, g_p, g_fdr, g_goodsgrna = parse_fields(
                line.strip().split(), lowfile)
            gfile[gid] = [[gitem, g_lo, g_p, g_fdr, nline - 1, g_goodsgrna]]
    maxnline = nline

    nline = 0
    with open(highfile) as handle:
        for line in handle:
            nline += 1
            if nline == 1:  # skip the header line
                continue
            gid, gitem, g_lo, g_p, g_fdr, g_goodsgrna = parse_fields(
                line.strip().split(), highfile)
            if gid not in gfile:
                logging.warning('Item ' + gid + ' appears in ' + highfile +
                                ', but not in ' + lowfile + '.')
                # Placeholder for the missing negative record. It carries
                # the item number so that the output columns stay aligned.
                gfile[gid] = [[gitem, 1.0, 1.0, 1.0, maxnline, 0]]
            elif gfile[gid][0][0] != gitem:
                logging.warning('Item number of ' + gid +
                                ' does not match previous file: ' +
                                str(gitem) + ' !=' + str(gfile[gid][0][0]) +
                                '.')
            # Always keep the positive record, also for items that were
            # missing in lowfile.
            gfile[gid].append([g_lo, g_p, g_fdr, nline - 1, g_goodsgrna])

    # Items that appear in the first file but not in the second one get a
    # placeholder positive record.
    for gid, records in gfile.items():
        if len(records) == 1:
            logging.warning('Item ' + gid + ' appears in ' + lowfile +
                            ', but not in ' + highfile + '.')
            records.append([1.0, 1.0, 1.0, maxnline, 0])

    if hasattr(args, 'sort_criteria') and args.sort_criteria == 'pos':
        logging.debug('Sorting the merged items by positive selection...')
        skey = sorted(gfile.items(), key=lambda x: x[1][1][0])
    else:
        logging.debug('Sorting the merged items by negative selection...')
        skey = sorted(gfile.items(), key=lambda x: x[1][0][1])

    # correct FDR method from RRA
    if hasattr(args, 'adjust_method') and args.adjust_method != 'fdr':
        from mageck.fdr_calculation import pFDR
        logging.debug('adjusting fdr using ' + args.adjust_method +
                      ' method ...')
        # negative record: p value at index 2, fdr at index 3
        pnegpool = [records[0][2] for _, records in skey]
        # positive record: p value at index 1, fdr at index 2
        ppospool = [records[1][1] for _, records in skey]
        dfrnegpool = pFDR(pnegpool, method=args.adjust_method)
        dfrpospool = pFDR(ppospool, method=args.adjust_method)
        for ind, (_, records) in enumerate(skey):
            records[0][3] = dfrnegpool[ind]
            records[1][2] = dfrpospool[ind]

    # The context manager closes the output file even if writing fails.
    with open(outfile, 'w') as ofhd:
        print('\t'.join([
            'id', 'num', 'neg|score', 'neg|p-value', 'neg|fdr', 'neg|rank',
            'neg|goodsgrna', 'pos|score', 'pos|p-value', 'pos|fdr',
            'pos|rank', 'pos|goodsgrna'
        ]),
              file=ofhd)
        for gid, records in skey:
            values = [str(t) for t in records[0] + records[1]]
            print('\t'.join([gid] + values), file=ofhd)
コード例 #3
0
def merge_rank_files(lowfile, highfile, outfile, args, cutoffinfo):
    """
    Merge neg. and pos. selected files (generated by RRA) into one.

    Both input files are whitespace-separated tables whose first line is a
    header. Every other line must provide at least six fields, read as:
    id, number of sgRNAs, score, p value, FDR, number of good sgRNAs.
    An FDR field that cannot be converted to a float is stored as "NA" and
    the record is flagged as bad (``isbad``).

    Parameters:
      lowfile
          RRA neg. selection output
      highfile
          RRA pos. selection output
      outfile
          The output file name
      args
          Object with optional attributes:
          ``sort_criteria`` -- "pos" sorts the output by positive selection
          rank, anything else by negative selection rank;
          ``adjust_method`` -- if present and not "fdr", the FDR of all
          records not flagged as bad is recomputed from the p values with
          ``mageck.fdr_calculation.pFDR``.
      cutoffinfo
          The return value of crispr_test. Include (low_p_threshold,
          high_p_threshold, lower_gene_lfc, higher_gene_lfc), where
          lower_gene_lfc={gene:lfc} is the log fold change of sgRNAs.
          Only elements 2 and 3 are used here.

    Items found in only one of the files get a placeholder ``Rank_Obj`` for
    the other direction whose rank is the number of lines of lowfile; a
    warning is logged.
    NOTE(review): all other placeholder fields keep the defaults of
    ``Rank_Obj``, which is defined outside this function -- confirm they are
    suitable for output.

    Exits with status -1 (after logging an error) if a line has fewer than
    six fields.
    """
    # id, num, score, p-value, fdr, goodsgrna: all six are read below.
    required_fields = 6
    lower_gene_lfc = cutoffinfo[2]
    higher_gene_lfc = cutoffinfo[3]

    def build_rank_obj(field, path, rank, gene_lfc):
        """Convert one split input line into a Rank_Obj."""
        if len(field) < required_fields:
            logging.error("The number of fields in file " + path + " is <" +
                          str(required_fields) + ".")
            sys.exit(-1)
        r_o = Rank_Obj()
        r_o.name = field[0]
        r_o.sgrna = int(field[1])
        r_o.lo = float(field[2])
        r_o.pval = float(field[3])
        r_o.rank = rank
        try:
            r_o.fdr = float(field[4])
        except ValueError:
            # Non-numeric FDR: keep the record but exclude it from the FDR
            # re-adjustment below.
            r_o.fdr = "NA"
            r_o.isbad = True
        r_o.goodsgrna = int(field[5])
        if r_o.name in gene_lfc:
            r_o.lfc = "{:.5g}".format(gene_lfc[r_o.name])
        else:
            r_o.lfc = 0.0
        return r_o

    # gfile maps id -> [negative Rank_Obj, positive Rank_Obj]
    gfile = {}
    nline = 0
    with open(lowfile) as handle:
        for line in handle:
            nline += 1
            if nline == 1:  # skip the header line
                continue
            r_o = build_rank_obj(line.strip().split(), lowfile, nline - 1,
                                 lower_gene_lfc)
            gfile[r_o.name] = [r_o]
    maxnline = nline

    nline = 0
    with open(highfile) as handle:
        for line in handle:
            nline += 1
            if nline == 1:  # skip the header line
                continue
            r_o = build_rank_obj(line.strip().split(), highfile, nline - 1,
                                 higher_gene_lfc)
            if r_o.name not in gfile:
                logging.warning("Item " + r_o.name + " appears in " +
                                highfile + ", but not in " + lowfile + ".")
                # Placeholder for the missing negative record. The output
                # row takes its sgRNA count from the negative object, so
                # copy the identifying fields over.
                r_o2 = Rank_Obj()
                r_o2.name = r_o.name
                r_o2.sgrna = r_o.sgrna
                r_o2.rank = maxnline
                gfile[r_o.name] = [r_o2]
            elif gfile[r_o.name][0].sgrna != r_o.sgrna:
                logging.warning("Item number of " + r_o.name +
                                " does not match previous file: " +
                                str(r_o.sgrna) + " !=" +
                                str(gfile[r_o.name][0].sgrna) + ".")
            gfile[r_o.name].append(r_o)

    # Items that appear in the first file but not in the second one get a
    # placeholder positive record.
    for name, records in gfile.items():
        if len(records) == 1:
            logging.warning("Item " + name + " appears in " + lowfile +
                            ", but not in " + highfile + ".")
            r_o2 = Rank_Obj()
            r_o2.rank = maxnline
            records.append(r_o2)

    if hasattr(args, "sort_criteria") and args.sort_criteria == "pos":
        logging.debug("Sorting the merged items by positive selection...")
        skey = sorted(gfile.items(), key=lambda x: x[1][1].rank)
    else:
        logging.debug("Sorting the merged items by negative selection...")
        skey = sorted(gfile.items(), key=lambda x: x[1][0].rank)

    # correct FDR method from RRA
    if hasattr(args, "adjust_method") and args.adjust_method != "fdr":
        from mageck.fdr_calculation import pFDR

        logging.debug("adjusting fdr using " + args.adjust_method +
                      " method ...")
        sides = (0, 1)  # 0: negative selection, 1: positive selection
        # Only records with a usable FDR field take part in the adjustment.
        pools = [[t[1][side].pval for t in skey if not t[1][side].isbad]
                 for side in sides]
        adjusted = [pFDR(pool, method=args.adjust_method) for pool in pools]
        for side in sides:
            ind = 0
            for t in skey:
                obj = t[1][side]
                if obj.isbad:
                    obj.fdr = "NA"
                else:
                    obj.fdr = adjusted[side][ind]
                    ind += 1

    # The context manager closes the output file even if writing fails.
    with open(outfile, "w") as ofhd:
        print(
            "\t".join([
                "id",
                "num",
                "neg|score",
                "neg|p-value",
                "neg|fdr",
                "neg|rank",
                "neg|goodsgrna",
                "neg|lfc",
                "pos|score",
                "pos|p-value",
                "pos|fdr",
                "pos|rank",
                "pos|goodsgrna",
                "pos|lfc",
            ]),
            file=ofhd,
        )
        for name, (negobj, posobj) in ((k, v[:2]) for k, v in skey):
            # The dictionary key is the item name of every parsed record, so
            # it is used as the id column.
            columns = [name, negobj.sgrna]
            for obj in (negobj, posobj):
                columns += [
                    obj.lo,
                    obj.pval,
                    obj.fdr,
                    obj.rank,
                    obj.goodsgrna,
                    obj.lfc,
                ]
            print("\t".join([str(x) for x in columns]), file=ofhd)
コード例 #4
0
ファイル: fileOps.py プロジェクト: yarker/MAGeCK_Repo
def merge_rank_files(lowfile,highfile,outfile,args):
  """
  Merge neg. and pos. selected files (generated by RRA) into one.

  Both input files are whitespace-separated tables whose first line is a
  header. Every other line must provide at least six fields, read as:
  id, number of items, score, p value, FDR, number of good sgRNAs.

  Parameters:
    lowfile
        RRA negative selection output.
    highfile
        RRA positive selection output.
    outfile
        Name of the merged, tab-separated output file.
    args
        Object with optional attributes:
        ``sort_criteria`` -- 'pos' sorts the output by the positive selection
        score, anything else by the negative selection score;
        ``adjust_method`` -- if present and not 'fdr', the FDR columns are
        recomputed from the p values with ``mageck.fdr_calculation.pFDR``.

  Items found in only one of the files get a placeholder record for the
  other direction (score, p value and FDR 1.0, rank equal to the number of
  lines of lowfile, 0 good sgRNAs) and a warning is logged.

  Exits with status -1 (after logging an error) if a line has fewer than six
  fields.
  """
  # id, num, score, p-value, fdr, goodsgrna: all six are read below.
  nfield_required=6

  def parse_line(line,fname):
    """Split one line, validate it and convert its first six fields."""
    field=line.strip().split()
    if len(field)<nfield_required:
      logging.error('The number of fields in file '+fname+' is <'+str(nfield_required)+'.')
      sys.exit(-1)
    return (field[0],int(field[1]),float(field[2]),float(field[3]),float(field[4]),int(field[5]))

  # gfile maps id -> [neg_record, pos_record]
  #   neg_record = [num, score, p, fdr, rank, goodsgrna]
  #   pos_record = [score, p, fdr, rank, goodsgrna]  (num is not repeated)
  gfile={}
  nline=0
  with open(lowfile) as fhd:
    for line in fhd:
      nline+=1
      if nline==1: # skip the header line
        continue
      (gid,gitem,g_lo,g_p,g_fdr,g_goodsgrna)=parse_line(line,lowfile)
      gfile[gid]=[[gitem,g_lo,g_p,g_fdr,nline-1,g_goodsgrna]]
  maxnline=nline

  nline=0
  with open(highfile) as fhd:
    for line in fhd:
      nline+=1
      if nline==1: # skip the header line
        continue
      (gid,gitem,g_lo,g_p,g_fdr,g_goodsgrna)=parse_line(line,highfile)
      if gid not in gfile:
        logging.warning('Item '+gid+' appears in '+highfile+', but not in '+lowfile+'.')
        # Placeholder for the missing negative record. It carries the item
        # number so that the output columns stay aligned.
        gfile[gid]=[[gitem,1.0,1.0,1.0,maxnline,0]]
      elif gfile[gid][0][0]!=gitem:
        logging.warning('Item number of '+gid+' does not match previous file: '+str(gitem)+' !='+str(gfile[gid][0][0])+'.')
      # Always keep the positive record, also for items missing in lowfile.
      gfile[gid].append([g_lo,g_p,g_fdr,nline-1,g_goodsgrna])

  # Items that appear in the first file but not in the second one get a
  # placeholder positive record.
  for (k,v) in gfile.items():
    if len(v)==1:
      logging.warning('Item '+k+' appears in '+lowfile+', but not in '+highfile+'.')
      v.append([1.0,1.0,1.0,maxnline,0])

  if hasattr(args,'sort_criteria') and args.sort_criteria=='pos':
    logging.debug('Sorting the merged items by positive selection...')
    skey=sorted(gfile.items(),key=lambda x : x[1][1][0])
  else:
    logging.debug('Sorting the merged items by negative selection...')
    skey=sorted(gfile.items(),key=lambda x : x[1][0][1])

  # correct FDR method from RRA
  if hasattr(args,'adjust_method') and args.adjust_method!='fdr':
    from mageck.fdr_calculation import pFDR
    logging.debug('adjusting fdr using '+args.adjust_method+' method ...')
    pnegpool=[v[0][2] for (k,v) in skey] # negative record: p value at index 2, fdr at index 3
    ppospool=[v[1][1] for (k,v) in skey] # positive record: p value at index 1, fdr at index 2
    dfrnegpool=pFDR(pnegpool,method=args.adjust_method)
    dfrpospool=pFDR(ppospool,method=args.adjust_method)
    for (ind,(k,v)) in enumerate(skey):
      v[0][3]=dfrnegpool[ind]
      v[1][2]=dfrpospool[ind]

  # The context manager closes the output file even if writing fails.
  with open(outfile,'w') as ofhd:
    print('\t'.join(['id','num','neg|score','neg|p-value','neg|fdr','neg|rank','neg|goodsgrna','pos|score','pos|p-value','pos|fdr','pos|rank','pos|goodsgrna']),file=ofhd)
    for (k,v) in skey:
      print('\t'.join([k]+[str(t) for t in v[0]+v[1]]),file=ofhd)