def reorderbacteria(exp,order,inplace=False):
    """
    reorder (and optionally drop) the bacteria of an experiment, keeping the
    per-sequence annotations in sync

    input:
    exp : Experiment
        the experiment to reorder
    order : list/array of int
        the new order. used to index the rows of exp.data and handed to
        hs.reorder for seqs, tax and sids
    inplace : bool
        False (default) to work on a copy made with copyexp,
        True to modify exp itself

    output:
    newexp : Experiment
        the reordered experiment (exp itself when inplace is True)
    """
    # NOTE(review): a second reorderbacteria is defined later in this file;
    # if both live in the same module the later one replaces this one.
    newexp = exp if inplace else copyexp(exp)

    newexp.data = newexp.data[order, :]
    newexp.seqs = hs.reorder(newexp.seqs, order)
    # rebuild the sequence -> row index lookup for the new order
    newexp.seqdict = {cseq: idx for idx, cseq in enumerate(newexp.seqs)}
    newexp.tax = hs.reorder(newexp.tax, order)
    newexp.sids = hs.reorder(newexp.sids, order)

    # filter the annotations if needed
    if exp.seqannotations is not None:
        keptannotations = {}
        seqsperannotation = collections.defaultdict(list)
        for cseq in newexp.seqs:
            cannotations = newexp.seqannotations[cseq]
            keptannotations[cseq] = cannotations
            # reverse lookup: annotation -> sequences carrying it
            for cinfo in cannotations:
                seqsperannotation[cinfo].append(cseq)
        newexp.seqannotations = keptannotations
        newexp.annotationseqs = seqsperannotation
    return newexp
def plotdistheatmap(gdist,uvals,neworder=False,vmin=0,vmax=1):
    """
    plot a distance heat map and add axis labels

    input:
    gdist : numpy array of float
        the distance matrix (from getgroupdist)
    uvals : list of strings
        the names of the categories (from getgroupdist)
    neworder : list/array of integers or False
        if not False (and not empty), the order by which to sort the matrix
        and labels prior to plotting
    vmin,vmax : int (optional)
        the range for the heatmap
    """
    # explicit test instead of "if neworder": the truth value of a numpy
    # index array with more than one element raises ValueError
    if neworder is not False and neworder is not None and len(neworder) > 0:
        gdist = gdist[neworder, :]
        gdist = gdist[:, neworder]
        uvals = hs.reorder(uvals, neworder)

    plt.figure()
    iax = plt.imshow(gdist, interpolation='nearest', aspect='auto', vmin=vmin, vmax=vmax)
    # the axes attribute replaces the get_axes() accessor, which is no
    # longer available in current matplotlib releases
    ax = iax.axes
    tickpos = range(len(uvals))
    ax.set_xticks(tickpos)
    ax.set_xticklabels(uvals, rotation=90)
    ax.set_yticks(tickpos)
    ax.set_yticklabels(uvals)
    plt.tight_layout()
    plt.draw()
    plt.colorbar()
def filterfieldwave(expdat,field,val1,val2=False,mineffect=1,method='mean',uselog=True):
    """
    find all sequences which show an effect size of at least mineffect between
    val1 and val2 samples in field
    no statistical significance testing is performed
    the data of expdat is not modified

    input:
    expdat : Experiment
    field : string
        name of field to use for group separation
    val1 : string
        value in field for group1
    val2 : string
        value in field for group2 or False for all the other samples except val1
    mineffect : float
        min absolute difference between the group means per OTU in order to keep
    method : string
        'ranksum' to replace each sequence's values with their ranks across
        the samples before comparing the group means; any other value
        (default 'mean') compares the means of the values themselves
    uselog : bool
        True to log transform the data (values below 1 are set to 1, then log2)

    output:
    newexp : Experiment
        only with sequences showing a mineffect difference, sorted by
        increasing (group1 - group2) difference
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    numseqs = len(expdat.seqs)
    numsamples = len(expdat.samples)

    # private float copy: the clamping / rank assignments below would
    # otherwise overwrite expdat.data (and ranks such as 1.5 would be
    # truncated if the data were stored as integers)
    dat = np.array(expdat.data, dtype=float)
    if uselog:
        dat[dat < 1] = 1
        dat = np.log2(dat)
    if method == 'ranksum':
        for idx in range(numseqs):
            dat[idx, :] = stats.rankdata(dat[idx, :])

    pos1 = hs.findsamples(expdat, field, val1)
    if val2:
        pos2 = hs.findsamples(expdat, field, val2)
    else:
        pos2 = np.setdiff1d(np.arange(numsamples), pos1, assume_unique=True)

    outpos = []
    odif = []
    for idx in range(numseqs):
        cdif = np.mean(dat[idx, pos1]) - np.mean(dat[idx, pos2])
        if abs(cdif) >= mineffect:
            outpos.append(idx)
            odif.append(cdif)

    # order the surviving sequences by their effect size
    si = np.argsort(odif)
    outpos = hs.reorder(outpos, si)
    newexp = hs.reorderbacteria(expdat, outpos)
    newexp.filters.append('filterfieldwave field %s val1 %s val2 %s' % (field,val1,val2))
    hs.addcommand(newexp, "filterfieldwave", params=params, replaceparams={'expdat':expdat})
    return newexp
def reordersamples(exp,newpos,inplace=False):
    """
    reorder the samples of the experiment

    input:
    exp : Experiment
        the experiment to reorder
    newpos : array of int
        the new positions (can skip positions to delete them). used to index
        the columns of exp.data and handed to hs.reorder for samples and
        origreads
    inplace : bool
        False (default) to work on a copy made with copyexp,
        True to modify exp itself

    output:
    newexp : Experiment
        the new experiment (exp itself when inplace is True)
    """
    newexp = exp if inplace else copyexp(exp)

    # keep the data columns, sample names and original read counts aligned
    newexp.data = newexp.data[:, newpos]
    for attrname in ('samples', 'origreads'):
        setattr(newexp, attrname, hs.reorder(getattr(newexp, attrname), newpos))
    return newexp
def reorderbacteria(exp,order,inplace=False):
    """
    reorder the bacteria in an experiment (can delete if bacteria not in new order)

    input:
    exp : Experiment
        the experiment to reorder
    order : list/array of int
        the new order. used to index the rows of exp.data and handed to
        hs.reorder for seqs, tax and sids
    inplace : bool
        False (default) to work on a copy made with copyexp,
        True to modify exp itself

    output:
    newexp : Experiment
        the reordered experiment (exp itself when inplace is True)
    """
    # NOTE(review): an earlier reorderbacteria in this file also filters
    # seqannotations/annotationseqs; if both live in the same module this
    # later definition is the one in effect and annotations are not updated.
    target = exp if inplace else copyexp(exp)

    target.data = target.data[order, :]
    reorderedseqs = hs.reorder(target.seqs, order)
    target.seqs = reorderedseqs
    # sequence -> row index lookup must follow the new order
    target.seqdict = {cseq: idx for idx, cseq in enumerate(reorderedseqs)}
    target.tax = hs.reorder(target.tax, order)
    target.sids = hs.reorder(target.sids, order)
    return target
def plotdistbar(gdist,uvals,crow=0,neworder=False):
    """
    plot one row of a distance matrix as a bar plot and add x axis labels

    input:
    gdist : numpy array of float
        the distance matrix (from getgroupdist)
    uvals : list of strings
        the names of the categories (from getgroupdist)
    crow : int
        the row of gdist to plot (taken after reordering when neworder is given)
    neworder : list/array of integers or False
        if not False (and not empty), the order by which to sort the matrix
        and labels prior to plotting
    """
    # explicit test instead of "if neworder": the truth value of a numpy
    # index array with more than one element raises ValueError
    if neworder is not False and neworder is not None and len(neworder) > 0:
        gdist = gdist[neworder, :]
        gdist = gdist[:, neworder]
        uvals = hs.reorder(uvals, neworder)

    figure()
    numvals = len(uvals)
    # bars are positioned at (index - 0.5), ticks at the integer indices
    bar(np.arange(numvals) - 0.5, gdist[crow, :])
    ax = gca()
    ax.set_xticks(range(numvals))
    ax.set_xticklabels(uvals, rotation=90)
    tight_layout()
    draw()
def plotdiffsummary(expdatlist,seqs,field,val1,val2=False,method='mean',sortit=True,threshold=0.1,ptitle=False,showallfirstexp=True):
    """
    plot a heat map for the fold change in each experiment in expdatlist for the
    log2 foldchange between the 2 groups (val1,val2 values in field)
    for zech chinese ibd paper

    input:
    expdatlist : list of Experiment
        the experiments to plot (all must contain field and val1,val2 in it).
        a single experiment is supported
    seqs : list
        the sequences to examine
    field : string
        name of the field dividing the 2 groups
    val1 : string or list of strings
        value of the field for group 1 (or a list of values, 1 per experiment)
    val2 : string, False or list
        value of the field for group 2 or False for all the rest (not val1)
        (or a list of values, 1 per experiment)
    method : string
        passed to hs.getdiffsummary ('mean' - difference in the mean of the 2 groups)
    sortit : bool
        True to sort according to difference in the first expdat, False to use the order in seqs
    threshold : float
        minimum value of stat for ratio calculation (passed to hs.getdiffsummary)
    ptitle : string or False
        name of figure or False for auto title
    showallfirstexp : bool
        True - drop only sequences which are NaN in all experiments,
        False - drop sequences which are NaN in all experiments except the first

    output:
    diff : numpy array
        the same as the plotted heatmap (row per otu, column per experiment)
    expnames : list of strings
        names (studyname) of the experiments plotted (for label)
    otus : list
        the otu sequences for the rows
    """
    # a scalar group value applies to every experiment
    if not isinstance(val1, list):
        val1 = [val1] * len(expdatlist)
    if not isinstance(val2, list):
        val2 = [val2] * len(expdatlist)

    # one row of differences per experiment; stacking once always yields a
    # 2d array, so a single experiment works as well
    difrows = []
    for cidx, cexp in enumerate(expdatlist):
        cdiff = hs.getdiffsummary(cexp, seqs, field, val1[cidx], val2[cidx], method, threshold=threshold)
        difrows.append(np.array(cdiff))
    diff = np.vstack(difrows)

    # remove all NaN lines (not enough reads for threshold):
    # keepcols are the sequences with at least one non NaN value
    if showallfirstexp:
        keepcols = np.where(~np.isnan(diff).all(axis=0))[0]
    else:
        keepcols = np.where(~np.isnan(diff[1:, :]).all(axis=0))[0]
    diff = diff[:, keepcols]
    otus = hs.reorder(seqs, keepcols)

    if sortit:
        si = np.argsort(diff[0, :])
        diff = diff[:, si]
        otus = hs.reorder(otus, si)

    figure()
    # symmetric color limits so that 0 is the middle of the colormap
    maxdiff = np.nanmax(np.abs(diff))
    diff = np.transpose(diff)
    imshow(diff, interpolation='nearest', aspect='auto', cmap=plt.get_cmap("coolwarm"), clim=[-maxdiff, maxdiff])
    colorbar()
    if ptitle:
        title(ptitle)
    else:
        title("log2 fold change between %s and %s in field %s" % (val1,val2,field))
    expnames = [cexp.studyname for cexp in expdatlist]
    xticks(np.arange(len(expnames)), expnames, rotation=45)
    tight_layout()
    show()
    return diff, expnames, otus
def filtersimilarsamples(expdat,field,method='mean'):
    """
    join similar samples into one sample (i.e. to remove samples of same individual)
    expdat (including its mapping file entries) is not modified

    input:
    expdat : Experiment
    field : string
        Name of the field containing the values (for which similar values will be joined)
    method : string
        What to do with samples with similar value. options:
        'mean' - replace with a sample containing the mean of the samples
        'median'- replace with a sample containing the median of the samples
        'random' - replace with a single random sample out of these samples
        'sum' - replace with sum of original reads in all samples (data converted
                with hs.toorigreads, renormalized after with hs.normalizereads)
                and origreads updated
        'fracpres' - replace with fraction of samples where the bacteria is present

    output:
    newexp : Experiment
        like the input experiment but only one sample per unique value in field.
        mapping file fields which differ between the joined samples are set to 'NA'.
        False is returned if method is not supported and some value has more
        than one sample
    """
    # must stay the first statement so only the call arguments are captured
    params = locals()

    newexp = hs.copyexp(expdat)
    if method == 'sum':
        newexp = hs.toorigreads(newexp)
    uvals = hs.getfieldvals(expdat, field, ounique=True)
    keep = []
    for cval in uvals:
        cpos = hs.findsamples(expdat, field, cval)
        if len(cpos) == 1:
            keep.append(cpos[0])
            continue
        if method == 'random':
            keep.append(cpos[np.random.randint(len(cpos))])
            continue
        firstpos = cpos[0]

        # set the mapping file values. work on a copy of the first sample's
        # entry so the 'NA' markers do not leak into expdat.smap
        cmap = expdat.smap[expdat.samples[firstpos]].copy()
        for ccpos in cpos[1:]:
            othermap = expdat.smap[expdat.samples[ccpos]]
            for cfield in cmap.keys():
                if cmap[cfield] != othermap[cfield]:
                    cmap[cfield] = 'NA'

        if method == 'mean':
            joined = np.mean(expdat.data[:, cpos], axis=1)
        elif method == 'median':
            joined = np.median(expdat.data[:, cpos], axis=1)
        elif method == 'sum':
            # newexp holds the original read counts at this point
            joined = np.sum(newexp.data[:, cpos], axis=1)
            newexp.origreads[firstpos] = np.sum(hs.reorder(expdat.origreads, cpos))
        elif method == 'fracpres':
            # float divisor: avoid integer division of the presence counts
            joined = np.sum(expdat.data[:, cpos] > 0, axis=1) / float(len(cpos))
        else:
            hs.Debug(9,'method %s not supported' % method)
            return False

        # the first sample of the group carries the joined values
        newexp.data[:, firstpos] = joined
        keep.append(firstpos)
        newexp.smap[expdat.samples[firstpos]] = cmap

    newexp = hs.reordersamples(newexp, keep)
    if method == 'sum':
        newexp = hs.normalizereads(newexp)
    newexp.filters.append('Filter similar samples field %s method %s' % (field,method))
    hs.addcommand(newexp, "filtersimilarsamples", params=params, replaceparams={'expdat':expdat})
    hs.Debug(6,'%d samples before filtering, %d after' % (len(expdat.samples),len(newexp.samples)))
    return newexp