def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
    """
    Keep the bacteria showing a significant excess of positive or negative
    samples and order them by their mean sign (nans are ignored).

    input:
    expdat : Experiment
    field,value,exclude,exact : name of field and value of field used to
        select (via hs.filtersamples) the samples the test is based on,
        or field=False to use all samples (default)
    maxfval : float
        bacteria whose value returned by hs.fdr is above maxfval are dropped

    output:
    newexp : Experiment
        the surviving bacteria of expdat (all samples), ordered by
        increasing mean sign over the tested samples
    """
    # must stay the first statement so that only the call arguments are captured
    params = locals()

    # pick the samples on which the sign test is computed
    if not field:
        subexp = hs.copyexp(expdat)
    else:
        subexp = hs.filtersamples(expdat, field, value, exact=exact, exclude=exclude)

    # reduce the data to -1/0/+1 (nan stays nan and is counted in neither group)
    subexp.data = np.sign(subexp.data)
    positives = np.nansum(subexp.data > 0, axis=1)
    negatives = np.nansum(subexp.data < 0, axis=1)

    # binomial sign test per bacterium; rows without any signed sample keep p=1
    pvalues = np.ones(len(positives))
    for row, (npos, nneg) in enumerate(zip(positives, negatives)):
        total = npos + nneg
        if total == 0:
            continue
        # probability of seeing at most npos / at most nneg successes out of total
        tails = stats.binom.cdf([npos, nneg], total, 0.5)
        pvalues[row] = np.min(tails)
    meansign = np.nanmean(subexp.data, axis=1)

    # keep only the rows passing the threshold on the hs.fdr output
    corrected = hs.fdr(pvalues)
    passing = np.where(np.array(corrected) <= maxfval)[0]
    newexp = hs.reorderbacteria(expdat, passing)

    # order the survivors from most negative to most positive mean sign
    order = np.argsort(meansign[passing])
    newexp = hs.reorderbacteria(newexp, order)

    newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
    hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
    return newexp
def _countannotations(seqannotations, seqs):
    """Count how often each annotation term occurs over seqs; return (counts, total)."""
    counts = collections.Counter()
    for cseq in seqs:
        for cinfo in seqannotations[cseq]:
            counts[cinfo] += 1
    return counts, sum(counts.values())


def annotationenrichment(expdat,seqs,compareseqs=None,fdrval=0.1):
    """
    get a list of annotations enriched in seqs compared to random draw from expdat

    input:
    expdat : Experiment
    seqs : list of sequences ('ACGT')
        the sequences in which to test the enrichment
    compareseqs : list of sequences ('ACGT')
        the sequences to compare to
        None (default) to compare to all the experiment
    fdrval : float
        keep only annotations whose value returned by hs.fdr is <= fdrval

    output:
    newplist - a list of dict for annotations which are below fdr
        ('description','pval','observed','expected'), sorted by decreasing
        relative difference between observed and expected counts.
        Empty when the comparison group carries no annotations at all
        (the background frequencies are undefined in that case).
    """
    # if annotations not initialized - get them (to save time)
    if expdat.seqannotations is None or expdat.annotationseqs is None:
        expdat = hs.getexpannotations(expdat, usesupercooldb=False)

    # count the number of annotations for each term in the group
    # into a dict {term:total number observed in group}
    groupannotationcount, totgroup = _countannotations(expdat.seqannotations, seqs)

    # count the number of annotations for each term in the comparison group
    # into a dict {term:total number observed in comparison group}
    if compareseqs is None:
        compareseqs = expdat.seqs
    compgroupannotationcount, totcompgroup = _countannotations(expdat.seqannotations, compareseqs)

    # without any annotation in the comparison group the per-term background
    # probability would be a division by zero - nothing can be tested
    if totcompgroup == 0:
        return []

    hs.Debug(6,'%d annotations in group, %d in all' % (totgroup,totcompgroup))

    # calculate the probability per term
    # note: we use a bad calculation (can choose the same term twice for a single bacteria).
    # need to improve (permutations?)
    pv = []
    for cinfo in expdat.annotationseqs.keys():
        observed = groupannotationcount[cinfo]
        pcompgroup = float(compgroupannotationcount[cinfo]) / totcompgroup
        # lower tail: at most `observed` hits; upper tail: at least `observed` hits
        pval1 = stats.binom.cdf(observed, totgroup, pcompgroup)
        pval2 = stats.binom.cdf(totgroup - observed, totgroup, 1 - pcompgroup)
        pv.append({
            'pval': np.min([pval1, pval2]),
            'observed': observed,
            'expected': pcompgroup * totgroup,
            'description': cinfo,
        })

    fval = hs.fdr([cpv['pval'] for cpv in pv])
    keep = np.where(np.array(fval) <= fdrval)[0]
    plist = [pv[cidx] for cidx in keep]

    # relative difference between observed and expected, used as the sort key
    rat = []
    for cpv in plist:
        meancount = np.mean([cpv['observed'], cpv['expected']])
        if meancount:
            rat.append(np.abs(float(cpv['observed'] - cpv['expected'])) / meancount)
        else:
            # term absent from both groups: no difference (avoids a 0/0 nan)
            rat.append(0.0)

    # largest relative difference first
    si = np.argsort(rat)[::-1]
    newplist = [plist[idx] for idx in si]

    # report the terms that are over-represented in the group
    for cp in newplist:
        if cp['observed'] > cp['expected']:
            hs.Debug(6, cp)
    return newplist