def showleakage(expdat,seq,wwpp=('1','2','3','4','5','6','7','8')):
    """
    plot the per-well frequency of a single sequence on each primer plate
    one subplot is drawn per plate, each showing an 8x12 (row x column) heatmap of the
    log2 frequency of seq, with an 'x' drawn on wells flagged as NTC

    input:
    expdat : Experiment
        the sample mapping must contain the fields 'primerplate_int', 'Row', 'column_int' and 'NTC_bool'
    seq : str
        the sequence to plot
    wwpp : iterable of str
        the 'primerplate_int' values to plot, one subplot each
        (subplots are placed on a 3x3 grid, so at most 9 plates can be shown)
    """
    figure()
    newexp=hs.filterseqs(expdat,[seq])
    for idx,cplate in enumerate(wwpp):
        print(cplate)
        cexp=hs.filtersamples(newexp,'primerplate_int',cplate,exact=True)
        print(len(cexp.samples))
        subplot(3,3,idx+1)
        title(cplate)
        # wells without any sample stay NaN
        cmat=np.full([8,12],np.nan)
        ntcwells=[]
        for idx2,csamp in enumerate(cexp.samples):
            cinfo=cexp.smap[csamp]
            # row letter ('A','B',...) -> 0 based row index, 1 based column -> 0 based column index
            crow=ord(cinfo['Row'].lower())-ord('a')
            ccol=int(cinfo['column_int'])-1
            cval=cexp.data[0,idx2]
            # non-positive frequencies are shown at the bottom of the color scale
            cmat[crow,ccol]=np.log2(cval) if cval>0 else -5
            if int(cinfo['NTC_bool']):
                ntcwells.append((ccol,crow))
        imshow(cmat,interpolation='nearest',aspect='auto',clim=[-5,10],cmap=plt.get_cmap("coolwarm"))
        # labels are added after the image, as in the original plotting order
        for ccol,crow in ntcwells:
            text(ccol,crow,'x')
def findmislabels(expdat,field,distmetric='bc'):
    """
    find mislabelled samples according to field
    plots a heatmap where each row is a sample (labelled 'sample;fieldvalue') and each column is a
    field value, showing the mean distance from the sample to the other samples having that value

    input:
    expdat : Experiment
    field : string
        name of the field to examine (i.e. subjectid)
    distmetric : string
        the distance metric to use (see calcdist)
    """
    expdat=hs.sortsamples(expdat,field)
    fvals=hs.getfieldvals(expdat,field)
    ufvals=list(set(fvals))
    onames=[csamp+';'+cval for csamp,cval in zip(expdat.samples,fvals)]
    omat=np.zeros([len(fvals),len(ufvals)])
    for groupidx,groupval in enumerate(ufvals):
        cexp=hs.filtersamples(expdat,field,groupval,exact=True)
        for aidx,aval in enumerate(expdat.samples):
            # don't measure distance to ourselves
            cdist=[hs.calcdist(cexp.data[:,gidx],expdat.data[:,aidx],distmetric=distmetric)
                   for gidx,gval in enumerate(cexp.samples) if gval!=aval]
            # a sample that is the only member of its group has nothing to compare to
            omat[aidx,groupidx]=np.mean(cdist) if cdist else np.nan
    figure()
    iax=imshow(omat,interpolation='nearest',aspect='auto')
    # use the axes attribute rather than the removed Artist.get_axes() accessor
    ax=iax.axes
    ax.set_xticks(range(len(ufvals)))
    ax.set_xticklabels(ufvals,rotation=90)
    ax.set_yticks(range(len(onames)))
    ax.set_yticklabels(onames)
def sortbyvariance(expdat,field=False,value=False,exact=False,norm=False):
    """
    sort bacteria by how much they vary across samples
    the spread (standard deviation over samples) is measured on a subset of samples
    (field/value/exact) and the resulting order is then applied to the whole experiment

    input:
    expdat : Experiment
    field : string
        name of the field used to select the samples for measuring the spread, or False to use all samples
    value : string
        value of field selecting the samples to use
    exact : bool
        is the value exact or partial string
    norm : bool
        False to sort by the standard deviation, True to sort by standard deviation/mean

    output:
    newexp : Experiment
        the experiment with bacteria reordered according to the spread in the selected samples
    """
    params=locals()

    # samples the spread is measured on
    texp=hs.filtersamples(expdat,field,value,exact=exact) if field else copy.deepcopy(expdat)

    spread=np.std(texp.data,axis=1)
    if norm:
        spread=spread/np.mean(texp.data,axis=1)
    _,order=hs.isort(spread)

    newexp=hs.reorderbacteria(expdat,order)
    logmsg="sort by variance field=%s value=%s normalize=%s" % (field,value,norm)
    newexp.filters.append(logmsg)
    hs.addcommand(newexp,"sortbyvariance",params=params,replaceparams={'expdat':expdat})
    return newexp
def sortbygroupdiff(expdat,field,val1,val2):
    """
    sort bacteria in the experiment by the normalized difference of the mean log2 frequency
    between 2 groups of samples (field==val1 and field==val2, exact match)
    the sort key per bacteria is (m1-m2)/(m1+m2+20) where m1,m2 are the means of log2(freq+2) in each group

    input:
    expdat : Experiment
    field : string
        the name of the field defining the 2 groups
    val1,val2 : string
        the values of field for the 2 groups

    output:
    newexp : Experiment
        the experiment with bacteria sorted by the group difference
    """
    params=locals()

    group1=hs.filtersamples(expdat,field,val1,exact=True)
    group2=hs.filtersamples(expdat,field,val2,exact=True)

    logmean1=np.log2(group1.data+2).mean(axis=1)
    logmean2=np.log2(group2.data+2).mean(axis=1)
    numerator=logmean1-logmean2
    denominator=logmean1+logmean2+20
    _,order=hs.isort(numerator/denominator)

    newexp=hs.reorderbacteria(expdat,order)
    logmsg="sort by group difference field=%s val1=%s val2=%s" % (field,val1,val2)
    newexp.filters.append(logmsg)
    hs.addcommand(newexp,"sortbygroupdiff",params=params,replaceparams={'expdat':expdat})
    return newexp
def filterwinperid(expdat,idfield,field,val1,val2,mineffect=1):
    """
    do filterfieldwave on each individual (based on idfield) and join the resulting bacteria

    input:
    expdat : Experiment
    idfield : string
        name of the field identifying the individual. filterfieldwave is run separately on the
        samples of each unique value of this field
    field,val1,val2,mineffect :
        passed unchanged to filterfieldwave

    output:
    newexp : Experiment
        expdat filtered to the sequences kept by filterfieldwave in at least one individual
    """
    params=locals()

    # union of the sequences kept for each individual, in order of first appearance
    # (keeps the sequence list deterministic between runs, unlike list(set(...)))
    iseqs=[]
    seen=set()
    uids=hs.getfieldvals(expdat,idfield,ounique=True)
    for cid in uids:
        cexp=hs.filtersamples(expdat,idfield,cid)
        texp=hs.filterfieldwave(cexp,field,val1,val2,mineffect=mineffect)
        for cseq in texp.seqs:
            if cseq not in seen:
                seen.add(cseq)
                iseqs.append(cseq)
    newexp=hs.filterseqs(expdat,iseqs)
    # record the operation the same way the other analysis functions in this file do
    newexp.filters.append("filter win per id idfield=%s field=%s val1=%s val2=%s mineffect=%s" % (idfield,field,val1,val2,mineffect))
    hs.addcommand(newexp,"filterwinperid",params=params,replaceparams={'expdat':expdat})
    return newexp
def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
    """
    keep bacteria with a significant imbalance between positive and negative samples (ignoring nans)
    and sort them by their mean sign
    per bacteria, a binomial (p=0.5) lower tail probability is calculated for the number of positive and
    for the number of negative samples, the smaller of the two is used as the p-value, and only
    bacteria with hs.fdr() value <= maxfval are kept

    input:
    expdat : Experiment
    field,value,exclude,exact :
        name of field and value of field in order to test based only on these samples
        or field=False for all samples (default)
    maxfval : float
        the maximal value (after hs.fdr) for a bacteria to be kept

    output:
    newexp : Experiment
        the kept bacteria, sorted by the mean of the sample signs
    """
    params=locals()

    if not field:
        texp=hs.copyexp(expdat)
    else:
        texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude)

    # only the sign of each entry is used from here on
    texp.data=np.sign(texp.data)
    numpos=np.nansum(texp.data>0,axis=1)
    numneg=np.nansum(texp.data<0,axis=1)

    pval=np.ones(len(numpos))
    for cpos,(npos,nneg) in enumerate(zip(numpos,numneg)):
        # bacteria without any positive or negative sample keep p-value 1
        if npos!=0 or nneg!=0:
            ntot=npos+nneg
            lowpos=stats.binom.cdf(npos,ntot,0.5)
            lowneg=stats.binom.cdf(nneg,ntot,0.5)
            pval[cpos]=min(lowpos,lowneg)

    meansign=np.nanmean(texp.data,axis=1)
    fval=hs.fdr(pval)
    significant=np.where(np.array(fval)<=maxfval)[0]

    # first drop the non significant bacteria, then order the remaining ones by mean sign
    newexp=hs.reorderbacteria(expdat,significant)
    order=np.argsort(meansign[significant])
    newexp=hs.reorderbacteria(newexp,order)

    newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
    hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
    return newexp
def sortbyfreq(expdat,field=False,value=False,exact=False,exclude=False,logscale=True,useabs=False):
    """
    sort bacteria in experiment according to their mean frequency
    the mean is calculated on a subset of samples (field/value/exact/exclude) and the
    resulting order is then applied to the whole experiment

    input:
    expdat : Experiment
    field : string
        name of the field to select the samples used for the mean, or False for all samples
    value : string
        value of field selecting the samples to use
    exact : bool
        is the value exact or partial string
    exclude : bool
        True to use all samples except the field/value ones, False to use only the field/value samples (default=False)
    logscale : bool
        True (default) to clip frequencies below 2 to 2 and log2 transform before taking the mean,
        False to use the original values
    useabs : bool
        True to sort by the mean of the absolute values, False (default) to sort by the mean of the values

    output:
    newexp : Experiment
        the experiment with bacteria sorted according to the mean frequency in the selected samples
    """
    params=locals()

    if not field:
        texp=copy.deepcopy(expdat)
    else:
        texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude)

    if logscale:
        # clip before the transform so the minimal transformed value is 1
        texp.data[texp.data<2]=2
        texp.data=np.log2(texp.data)

    freqs=np.abs(texp.data) if useabs else texp.data
    meanvals=np.mean(freqs,axis=1)
    _,order=hs.isort(meanvals)

    newexp=hs.reorderbacteria(expdat,order)
    logmsg="sort by freq field=%s value=%s" % (field,value)
    newexp.filters.append(logmsg)
    hs.addcommand(newexp,"sortbyfreq",params=params,replaceparams={'expdat':expdat})
    return newexp
def getgroupgroupdist(expdat,field,distmat,dsamp,uvals=False,subfield='host_subject_id',vmin=0,vmax=1):
    """
    calculate the group to group distance matrix (groups of samples according to field) separately
    for each value of subfield (i.e. each individual), average the per individual matrices
    and plot the result as a heatmap

    input:
    expdat : Experiment
    field : string
        name of the field to group by
    distmat : numpy 2d array
        the distance matrix (from calcdistmat or loaddistmat)
    dsamp : dict
        the mapping of each sample id to the distance matrix position (from calcdistmat or loaddistmat)
    uvals : list of string
        the values (in field) to use, or empty/False to use all values
    subfield : str
        name of the subfield so all distances are calculated separately for each subfield value (i.e. 'host_subject_id')
    vmin,vmax : float
        passed to plotdistheatmap

    output:
    omat : numpy 2d array
        the sum of the per subfield-value group distance matrices divided by the number of matrices summed
    """
    vals=hs.getfieldvals(expdat,field)
    if not uvals:
        uvals=list(set(vals))

    individuals=hs.getfieldvals(expdat,subfield,ounique=True)
    nvals=len(uvals)
    omat=np.zeros([nvals,nvals])
    numok=0
    for cval in individuals:
        newexp=hs.filtersamples(expdat,subfield,cval)
        gdist,uvals=hs.getgroupdist(newexp,field,distmat,dsamp,plotit=False,uvals=uvals)
        # missing distances count as 0 towards the average
        gdist[np.isnan(gdist)]=0
        # NOTE(review): nans were just zeroed above, so this only skips matrices whose sum is
        # still nan (e.g. mixed +/-inf) - confirm whether skipping nan matrices was the intent
        if not np.isnan(np.sum(np.sum(gdist))):
            omat=omat+gdist
            numok+=1
    omat=omat/numok
    plotdistheatmap(omat,uvals,vmin=vmin,vmax=vmax)
    return omat
def filtersamples(self):
    """
    open the filter samples dialog for the selected experiment and apply the filter

    exactly one experiment must be selected in bMainList, otherwise a message is printed
    and nothing is done. if the dialog is accepted, hs.filtersamples is called with the
    field/value/exclude/exact chosen in the dialog. when the overwrite checkbox is unchecked
    the result is added as a new experiment named by the dialog's new name field,
    otherwise it is passed to replaceexp
    """
    selected=self.bMainList.selectedItems()
    if len(selected)!=1:
        print("Need 1 item")
        return

    cname=str(selected[0].text())
    cexp=self.explist[cname]
    dialog=FilterSamplesWindow(cexp)
    if dialog.exec_()!=QtGui.QDialog.Accepted:
        return

    field=str(dialog.cField.currentText())
    value=str(dialog.tValue.text())
    newname=str(dialog.tNewName.text())
    # check states are passed on as returned by the widgets (0 means unchecked)
    overwrite=dialog.cOverwrite.checkState()
    exclude=dialog.cExclude.checkState()
    exact=dialog.cExact.checkState()

    newexp=hs.filtersamples(cexp,field,value,exclude=exclude,exact=exact)
    if overwrite==0:
        newexp.studyname=newname
        self.addexp(newexp)
    else:
        self.replaceexp(newexp)