def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
    """
    order the bacteria of an experiment by their center of mass along the samples

    The center of mass of a bacteria is the sum of (reads * sample position) divided
    by the sum of its reads, where sample position is the index of the sample after
    the optional sort by field.

    input:
    expdat : Experiment
    field : string or False
        name of the field to sort the samples by before the calculation,
        or False to use the current sample order
    numeric : bool
        True if the sort field is numeric (ignored if no sort field)
    uselog : bool
        True to clip reads below 1 to 1 and log2 transform before the calculation

    output:
    newexp : Experiment
        expdat with the bacteria reordered by hs.isort of the centers of mass.
        The reordering is applied to expdat itself (not to the sample-sorted,
        clipped copy used for the calculation)
    """
    params=locals()
    workexp=hs.sortsamples(expdat,field,numeric=numeric) if field else hs.copyexp(expdat)
    reads=workexp.data
    if uselog:
        # clip so the log transform is defined (this modifies the working copy only)
        reads[reads<1]=1
        reads=np.log2(reads)
    positions=np.arange(len(workexp.samples))
    centers=[np.dot(reads[row,:],positions)/np.sum(reads[row,:]) for row in range(len(workexp.seqs))]
    _,order=hs.isort(centers)
    newexp=hs.reorderbacteria(expdat,order)
    newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field,uselog))
    hs.addcommand(newexp,"sortbycentermass",params=params,replaceparams={'expdat':expdat})
    return newexp
def savetsvtable(expdat,filename,logtransform=True):
    """
    save an experiment as a tab separated table, with columns for samples and rows for bacteria
    (for jose clemente long babies paper)

    The first line is a header ("Taxonomy", "Sequence", then one column per sample).
    Each following line holds the taxonomy, the sequence and the reads of one bacteria.

    input:
    expdat : Experiment
        needs the data, samples, seqs and tax attributes
    filename : string
        name of the output tsv file
    logtransform : bool
        True to save the log2 of the reads (reads below 1 are set to 1 first),
        False to save the reads
    """
    # copy the data matrix so the clipping below never touches the experiment
    ldat=np.array(expdat.data,dtype=float)
    if logtransform:
        ldat[ldat<1]=1
        ldat=np.log2(ldat)
    # the with block closes the file also if writing fails
    with open(filename,'w') as of:
        of.write("Taxonomy\tSequence")
        of.write("".join("\t%s" % csamp for csamp in expdat.samples))
        of.write("\n")
        for idx,cseq in enumerate(expdat.seqs):
            of.write("%s\t%s" % (expdat.tax[idx],cseq))
            of.write("".join("\t%f" % cval for cval in ldat[idx,:]))
            of.write("\n")
def normalizeprctile(expdat,percent=80):
    """
    normalize reads per sample so the percentile (rather than the mean) is equal in all samples
    used to reduce the effect of outliers (compositionality correction)
    note the percentile is calculated on the same set of bacteria for all samples

    input:
    expdat : Experiment
    percent : float
        the percentile to normalize (0-100)

    output:
    newexp : Experiment
        the new normalized experiment
    """
    params=locals()

    # select the bacteria to use - don't want to include very low freq. bacteria
    refexp=hs.filterminreads(expdat,1*len(expdat.samples))
    percvals=np.atleast_1d(np.percentile(refexp.data,percent,axis=0))
    meanperc=np.mean(percvals)

    newexp=hs.copyexp(expdat)
    # if the mean percentile is 0 (or undefined) there is nothing to normalize by
    if meanperc>0:
        scale=percvals/meanperc
        # samples with a 0 percentile cannot be rescaled - leave them as they are
        scale[scale==0]=1
        for idx in range(len(expdat.samples)):
            # divide (not multiply) so samples with a high percentile are scaled down
            # and all samples end with the same percentile value
            newexp.data[:,idx]=newexp.data[:,idx]/scale[idx]
    newexp.filters.append("normalize percentile %f" % percent)
    hs.addcommand(newexp,"normalizeprctile",params=params,replaceparams={'expdat':expdat})
    return newexp
def convertdatefield(expdat,field,newfield,timeformat='%m/%d/%y %H:%M'):
    """
    add a numeric time field (seconds since epoch) created from a date/time field

    input:
    expdat : Experiment
        the experiment to add the field to
    field : string
        name of the field containing the date/time string
    newfield : string
        name of the new field (will contain the seconds since epoch)
    timeformat : string
        format of the date/time field (as used by time.strptime)

    output:
    newexp : Experiment
        a copy of the experiment with the added time since epoch field
    """
    params=locals()
    newexp=hs.copyexp(expdat)
    newexp.fields.append(newfield)
    for sampleid in newexp.samples:
        sampmap=newexp.smap[sampleid]
        parsed=time.strptime(sampmap[field],timeformat)
        # mktime interprets the parsed time as local time
        sampmap[newfield]=time.mktime(parsed)
    newexp.filters.append('add time field %s (based on field %s)' % (newfield,field))
    hs.addcommand(newexp,"convertdatefield",params=params,replaceparams={'expdat':expdat})
    return newexp
def reloadmap(expdat,mapfilename):
    """
    replace the mapping data of a loaded experiment with the contents of a mapping file

    input:
    expdat : Experiment
    mapfilename : string
        Name of the mapping file to reload

    output:
    newexp : Experiment
        like expdat but with smap, fields and mapmd5 taken from the new map file.
        A debug message is issued for each experiment sample missing from the new map
    """
    params=locals()
    newexp=hs.copyexp(expdat)
    mapsamples,newexp.smap,newexp.fields,newexp.mapmd5=loadmap(mapfilename)
    # report (but do not remove) samples which do not appear in the new map
    missing=[sampleid for sampleid in newexp.samples if sampleid not in mapsamples]
    for sampleid in missing:
        hs.Debug(7,'Sample %s not in new map!' % sampleid)
    newexp.filters.append('reload map %s' % mapfilename)
    hs.addcommand(newexp,"reloadmapfile",params=params,replaceparams={'expdat':expdat})
    return newexp
def renamesamples(expdat,addstr,addbefore=True):
    """
    rename all the samples in expdat by adding addstr before or after the name of each sample

    input:
    expdat : Experiment
        the experiment to change the sample names in
    addstr : str
        the string to add to each sampleid
    addbefore : bool (optional)
        True (default) to add addstr before each sampleid
        False to add addstr after each sampleid

    output:
    newexp : Experiment
        a copy with the new sample names; the mapping data of each sample is
        copied as is under the new name
    """
    newexp=hs.copyexp(expdat)
    oldids=newexp.samples
    if addbefore:
        newids=[addstr+sampleid for sampleid in oldids]
    else:
        newids=[sampleid+addstr for sampleid in oldids]
    # rebuild the sample map keyed by the new ids (shallow copy of each entry)
    newexp.smap={newid:dict(newexp.smap[oldid]) for oldid,newid in zip(oldids,newids)}
    newexp.samples=newids
    return newexp
def toorigreads(expdat,inplace=False):
    """
    rescale each sample so its total number of reads equals its origreads value

    input:
    expdat : Experiment
    inplace : bool
        True to modify expdat, False (default) to work on a copy

    output:
    newexp : Experiment
        each sample has origreads reads (instead of 10k).
        samples with 0 total reads are left unchanged
    """
    params=locals()
    newexp=expdat if inplace else hs.copyexp(expdat)
    for idx in range(len(newexp.samples)):
        total=np.sum(newexp.data[:,idx])
        target=newexp.origreads[idx]
        if total!=0:
            newexp.data[:,idx]=newexp.data[:,idx]*(float(target)/total)
    newexp.filters.append("changed reads to origread value")
    hs.addcommand(newexp,"toorigreads",params=params,replaceparams={'expdat':expdat})
    return newexp
def toorigreads(expdat,inplace=False):
    """
    convert the reads back to absolute numbers using the scalingfactor of the experiment

    input:
    expdat : Experiment
    inplace : bool
        True to modify expdat, False (default) to work on a copy

    output:
    newexp : Experiment
        data is hs.multvec(data,scalingfactor) rounded and converted to int,
        and scalingfactor is reset to 1
    """
    params=locals()
    newexp=expdat if inplace else hs.copyexp(expdat)
    scaled=hs.multvec(newexp.data,newexp.scalingfactor)
    newexp.data=np.round(scaled).astype(int)
    # NOTE(review): scalingfactor becomes the scalar 1 here - confirm code that
    # indexes scalingfactor per sample copes with a scalar
    newexp.scalingfactor=1
    newexp.filters.append("changed reads to origread value")
    hs.addcommand(newexp,"toorigreads",params=params,replaceparams={'expdat':expdat})
    return newexp
def fieldtobact(expdat,field,bactname='',meanreads=1000,cutoff=0):
    """
    add a new bacteria whose reads are the (rescaled) values of a mapping file field
    (to facilitate numeric analysis of the field)

    input:
    expdat : Experiment
    field : string
        name of the field to convert
    bactname : string
        name of the new bacteria (empty to use the field name)
    meanreads : int
        the mean number of reads for the new bacteria (over samples with value>=cutoff)
    cutoff : int
        the minimal valid value of the field. samples with a lower value get meanreads

    output:
    newexp : Experiment
        a copy of expdat with the bacteria added via hs.insertbacteria
    """
    params=locals()
    if len(bactname)==0:
        bactname=field
    vals=np.array(hs.tofloat(hs.getfieldvals(expdat,field)))
    valid=vals>=cutoff
    invalid=vals<cutoff
    # rescale the valid values so their mean is meanreads
    scalefactor=np.mean(vals[valid])
    vals[valid]=(vals[valid]/scalefactor)*meanreads
    vals[invalid]=meanreads
    newexp=hs.copyexp(expdat)
    hs.insertbacteria(newexp,vals,bactname,bactname,logit=False)
    newexp.filters.append('add bacteria from map field %s' % field)
    hs.addcommand(newexp,"fieldtobact",params=params,replaceparams={'expdat':expdat})
    return newexp
def addmapfield(expdat,fieldname,defaultval='NA',inplace=False):
    """
    add a new field to the mapping file

    input:
    expdat : Experiment
    fieldname : str
        name of the new field
    defaultval : str
        the value of the new field for all samples
    inplace : bool
        True to overwrite current experiment, False (default) to copy

    output:
    newexp : Experiment
        with the new field added. If the field already exists, the experiment
        (or its copy) is returned without change and a debug message is issued
    """
    newexp=expdat if inplace else hs.copyexp(expdat)
    if fieldname in newexp.fields:
        # include the field name in the message (it was previously left unformatted)
        hs.Debug(8,'field %s already exists' % fieldname)
        return newexp
    newexp.fields.append(fieldname)
    for csamp in newexp.samples:
        newexp.smap[csamp][fieldname]=defaultval
    return newexp
def normalizereads(expdat,numreads=10000,fixorig=False,inplace=False):
    """
    rescale every sample so its reads sum to numreads (default 10k)

    input:
    expdat : Experiment
    numreads : int
        the number of reads to normalize each sample to
    fixorig : bool
        True to divide origreads of each sample by the same ratio, False to keep as before
    inplace : bool
        True to modify expdat, False (default) to create a new experiment

    output:
    newexp : Experiment
        the normalized experiment. samples with 0 total reads are left unchanged
    """
    params=locals()
    newexp=expdat if inplace else hs.copyexp(expdat)
    for idx in range(len(newexp.samples)):
        totreads=np.sum(newexp.data[:,idx])
        if totreads!=0:
            ratio=float(numreads)/totreads
            newexp.data[:,idx]=newexp.data[:,idx]*ratio
            if fixorig:
                hs.Debug(2,'fixing original frequencies')
                newexp.origreads[idx]=float(newexp.origreads[idx])/ratio
    newexp.filters.append("renormalized reads to sum %d" % numreads)
    hs.addcommand(newexp,"normalizereads",params=params,replaceparams={'expdat':expdat})
    return newexp
def normalizebyseqs(expdat,seqs,exclude=False,fixorig=True):
    """
    normalize experiment by making the sum of frequencies in seqs constant in each sample

    input:
    expdat : Experiment
    seqs : list of sequences
        the sequences to use as the normalization factor (sum of the sequences).
        all must be present in expdat.seqdict
    exclude : bool
        True to use all sequences except seqs as the normalization factor, False to use seqs
    fixorig : bool
        True to modify the origreads (and scalingfactor) of each sample, False to leave as is

    output:
    newexp : Experiment
        the normalized experiment (a copy of expdat)
    """
    params=locals()
    newexp=hs.copyexp(expdat)
    spos=[expdat.seqdict[cseq] for cseq in seqs]
    if exclude:
        spos=np.setdiff1d(np.arange(len(expdat.seqs)),spos)
    # per sample sum of the normalizing sequences (as float)
    ssum=np.sum(expdat.data[spos,:],axis=0)+0.0
    # samples without any of the sequences would give a division by 0
    ssum[ssum==0]=1
    frat=ssum/np.mean(ssum)
    for idx in range(len(expdat.samples)):
        newexp.data[:,idx]=newexp.data[:,idx]/frat[idx]
        if fixorig:
            newexp.origreads[idx]=newexp.origreads[idx]/frat[idx]
            # use the ratio of this sample only (not the whole ratio vector)
            newexp.scalingfactor[idx]=newexp.scalingfactor[idx]*frat[idx]
    filt='Normalize By Seqs '
    if len(spos)==1:
        filt+=newexp.tax[spos[0]]
    else:
        filt+=str(len(spos))
    if exclude:
        filt+=' Exclude'
    newexp.filters.append(filt)
    hs.addcommand(newexp,"normalizebyseqs",params=params,replaceparams={'expdat':expdat})
    return newexp
def addsubtrees(expdat,tree,inplace=False):
    """
    add an otu for each subtree, with the frequency being the sum of all bacteria in the subtree

    input:
    expdat : Experiment
    tree
        the tree for the experiment (its subsets() gives the members of each subtree)
    inplace : bool
        True to start from expdat itself, False (default) to start from a copy

    output:
    newexp : Experiment
        the experiment with the added subtree otus. An otu is added only for subtrees
        with at least 2 bacteria present in the experiment. Its sequence is the
        comma terminated list of the member positions and its taxonomy is the
        common start of the member taxonomies
    """
    params=locals()
    newexp=expdat if inplace else hs.copyexp(expdat)
    for members in tree.subsets():
        positions=[]
        jointtax=""
        totfreq=np.zeros([1,len(newexp.samples)])
        for cbact in members:
            if cbact in newexp.seqdict:
                cpos=newexp.seqdict[cbact]
                positions.append(cpos)
                totfreq+=newexp.data[cpos,:]
                ctax=newexp.tax[cpos]
                jointtax=ctax if jointtax=='' else hs.common_start(jointtax,ctax)
            else:
                hs.Debug(4,'sequence not in seqdict',cbact)
        jointname=''.join('%d,' % cpos for cpos in positions)
        # add only if we have 2 bacteria or more and this subtree was not added yet
        if len(positions)>1 and jointname not in newexp.seqdict:
            newexp,newpos=insertbacteria(newexp,freqs=totfreq,seq=jointname,tax=jointtax,logit=False)
    newexp.filters.append("Add subtrees")
    hs.addcommand(newexp,"addsubtrees",params=params,replaceparams={'expdat':expdat})
    return newexp
def filtermapfields(expdat,fields=['#SampleID'],keep=True,inplace=False):
    """
    filter fields from the experiment mapping data

    input:
    expdat : Experiment
    fields : list of str
        the list of the fields to keep/remove
    keep : bool (optional)
        True (default) to keep only the fields specified
        False to remove the fields specified
    inplace : bool (optional)
        False (default) to create new experiment
        True to replace in current experiment

    output:
    newexp : Experiment
        with only the fields requested. '#SampleID' is always kept, and the
        remaining fields keep their original order
    """
    params=locals()
    requested=set(fields)
    # walk the original field list so the output order is deterministic
    newfields=[]
    for cfield in expdat.fields:
        if cfield in newfields:
            continue
        if cfield=='#SampleID' or (cfield in requested)==bool(keep):
            newfields.append(cfield)
    if '#SampleID' not in newfields:
        newfields.insert(0,'#SampleID')

    newsmap={}
    for csamp in expdat.samples:
        newsmap[csamp]={cfield:expdat.smap[csamp][cfield] for cfield in newfields}

    newexp=expdat if inplace else hs.copyexp(expdat)
    newexp.fields=newfields
    newexp.smap=newsmap
    # record the operation on the returned experiment (not on the input one)
    newexp.filters.append('filter map fields %s (keep=%s)' % (fields,keep))
    hs.addcommand(newexp,"filtermapfields",params=params,replaceparams={'expdat':expdat})
    return newexp
def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
    """
    keep bacteria with a significant excess of positive or negative samples and sort them
    by their mean sign (ignoring nans)

    input:
    expdat : Experiment
    field,value,exclude,exact :
        name of field and value of field (passed to hs.filtersamples) in order to use
        only these samples for the calculation, or field=False for all samples (default)
    maxfval : float
        the maximal value returned by hs.fdr for a bacteria to be kept

    output:
    newexp : Experiment
        expdat with only the bacteria passing maxfval, ordered by np.argsort of the mean sign
    """
    params=locals()
    texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude) if field else hs.copyexp(expdat)
    texp.data=np.sign(texp.data)
    numpos=np.nansum(texp.data>0,axis=1)
    numneg=np.nansum(texp.data<0,axis=1)
    pval=np.ones(len(numpos))
    for row,(npos,nneg) in enumerate(zip(numpos,numneg)):
        if npos!=0 or nneg!=0:
            # binomial test (p=0.5) for the smaller of the two tails
            total=npos+nneg
            lowpos=stats.binom.cdf(npos,total,0.5)
            lowneg=stats.binom.cdf(nneg,total,0.5)
            pval[row]=np.min([lowpos,lowneg])
    meansign=np.nanmean(texp.data,axis=1)
    fval=hs.fdr(pval)
    keep=np.where(np.array(fval)<=maxfval)[0]
    newexp=hs.reorderbacteria(expdat,keep)
    order=np.argsort(meansign[keep])
    newexp=hs.reorderbacteria(newexp,order)
    newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
    hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
    return newexp
def zerobacteria(expdat,inplace=False):
    """
    remove all the bacteria from an experiment (bacteria can then be added with insertbacteria)

    input:
    expdat : Experiment
    inplace : bool
        True to do inplace, False to make new copy

    output:
    newexp : Experiment
        data has 0 rows (and one column per sample), and seqs, tax, sids, seqdict are empty
    """
    newexp=expdat if inplace else hs.copyexp(expdat)
    numsamples=len(newexp.samples)
    newexp.data=np.zeros([0,numsamples])
    newexp.seqs,newexp.tax,newexp.sids=[],[],[]
    newexp.seqdict={}
    return newexp
def taxtoseq(expdat,fixtax=False):
    """
    use the taxonomy of each bacteria as its sequence

    input:
    expdat : Experiment
    fixtax : bool (optional)
        False (default) to just copy the taxonomy,
        True to drop the first 3 characters (k__ etc.) of each of the 7 levels

    output:
    newexp : Experiment
        a copy with seqs=taxonomies and seqdict rebuilt. a name which already
        appeared gets '-' and its position appended
    """
    newexp=hs.copyexp(expdat)
    if fixtax:
        names=[]
        for ctax in newexp.tax:
            levels=ctax.split(';')
            # always 7 ';' terminated levels, missing levels are left empty
            names.append(''.join((levels[clevel][3:] if clevel<len(levels) else '')+';' for clevel in range(7)))
    else:
        names=newexp.tax
    newexp.seqdict={}
    uniquenames=[]
    for idx,cname in enumerate(names):
        if cname in newexp.seqdict:
            hs.Debug(8,'found %s again' % cname)
            cname=cname+'-'+str(idx)
        uniquenames.append(cname)
        newexp.seqdict[cname]=idx
    newexp.seqs=uniquenames
    return newexp
def cleantaxonomy(self):
    """
    GUI action: open the clean taxonomy dialog for the single selected experiment and,
    if accepted, add a filtered and renormalized copy (study name + '.ct') to the list
    """
    selected=self.bMainList.selectedItems()
    if len(selected)!=1:
        print("Need 1 item")
        return
    origexp=self.explist[str(selected[0].text())]
    dialog = CleanTaxonomyWindow(origexp)
    if dialog.exec_()!=QtGui.QDialog.Accepted:
        return
    # checkbox name -> taxonomy filters to exclude when it is checked
    steps=(
        ('cMitochondria',(('mitochondria',{}),)),
        ('cChloroplast',(('Streptophyta',{}),('Chloroplast',{}))),
        ('cUnknown',(('nknown',{}),)),
        ('cBacteria',(('Bacteria;',{'exact':True}),)),
    )
    newexp=hs.copyexp(origexp)
    for boxname,taxfilters in steps:
        if getattr(dialog,boxname).checkState():
            for taxname,extra in taxfilters:
                newexp=hs.filtertaxonomy(newexp,taxname,exclude=True,**extra)
    newexp=hs.normalizereads(newexp)
    newexp.studyname=origexp.studyname+'.ct'
    self.addexp(newexp)
def changemapval(expdat,newfield,newval,oldfield,vals,inplace=False):
    """
    set a mapping file field to a new value in the samples selected by another field

    input:
    expdat : Experiment
    newfield : str
        name of the field to change the values in (from addmapfield?)
    newval :
        the new value to put
    oldfield : str
        the field with the values to test
    vals : list
        newfield is set to newval only in samples where the value of oldfield is in vals
    inplace : bool
        True to overwrite current experiment, False (default) to copy

    output:
    newexp : Experiment
        the experiment with the updated mapping values
    """
    newexp=expdat if inplace else hs.copyexp(expdat)
    for sampmap in (newexp.smap[sampleid] for sampleid in newexp.samples):
        if sampmap[oldfield] in vals:
            sampmap[newfield]=newval
    return newexp
def cleantaxonomy(expdat,mitochondria=True,chloroplast=True,bacteria=True,unknown=True,exclude=True):
    """
    remove common non-16s sequences from the experiment and renormalize

    input:
    expdat : Experiment
    mitochondria : bool
        remove mitochondrial sequences
    chloroplast : bool
        remove chloroplast sequences ('Streptophyta' and 'Chloroplast')
    bacteria : bool
        remove sequences only identified as "Bacteria" (no finer identification)
    unknown : bool
        remove unknown sequences ('Unknown' and exactly 'Unclassified;')
    exclude : bool
        True (default) to remove these sequences (and renormalize),
        False to keep only them and throw the others (no renormalization)

    output:
    newexp : Experiment
        the experiment after the taxonomy filtering
    """
    params=locals()

    # (taxonomy string, extra filtertaxonomy parameters) for each enabled group
    taxfilters=[]
    if mitochondria:
        taxfilters.append(('mitochondria',{}))
    if chloroplast:
        taxfilters.append(('Streptophyta',{}))
        taxfilters.append(('Chloroplast',{}))
    if unknown:
        taxfilters.append(('Unknown',{}))
        taxfilters.append(('Unclassified;',{'exact':True}))
    if bacteria:
        taxfilters.append(('Bacteria;',{'exact':True}))

    newexp=hs.copyexp(expdat)
    if exclude:
        for taxname,extra in taxfilters:
            newexp=hs.filtertaxonomy(newexp,taxname,exclude=True,**extra)
        newexp=hs.normalizereads(newexp)
    else:
        # collect the sequences matching any enabled group. Only enabled groups are
        # queried, so disabling a group no longer fails on an undefined variable
        allseqs=[]
        seen=set()
        for taxname,extra in taxfilters:
            matched=hs.filtertaxonomy(newexp,taxname,exclude=False,**extra)
            for cseq in matched.seqs:
                if cseq not in seen:
                    seen.add(cseq)
                    allseqs.append(cseq)
        newexp=hs.filterseqs(newexp,allseqs)
    newexp.filters.append('Clean Taxonomy (remove mitochondria etc.)')
    hs.addcommand(newexp,"cleantaxonomy",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d sequences before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
    return newexp
def filtersimilarsamples(expdat,field,method='mean'):
    """
    join similar samples into one sample (i.e. to remove samples of same individual)

    input:
    expdat : Experiment
    field : string
        Name of the field containing the values (for which similar values will be joined)
    method : string
        What to do with samples with similar value. options:
        'mean' - replace with a sample containing the mean of the samples
        'median'- replace with a sample containing the median of the samples
        'random' - replace with a single random sample out of these samples
        'sum' - replace with sum of original reads in all samples, renormalized after to 10k and orignumreads updated
        'fracpres' - replace with fraction of samples where the bacteria is present

    output:
    newexp : Experiment
        like the input experiment but only one sample per unique value in field.
        Mapping fields which differ between the joined samples are set to 'NA'.
        False is returned if samples need to be joined and method is not supported
    """
    params=locals()
    newexp=hs.copyexp(expdat)
    if method=='sum':
        newexp=hs.toorigreads(newexp)
    uvals=hs.getfieldvals(expdat,field,ounique=True)
    keep=[]
    for cval in uvals:
        cpos=hs.findsamples(expdat,field,cval)
        if len(cpos)==1:
            keep.append(cpos[0])
            continue
        if method=='random':
            keep.append(cpos[np.random.randint(len(cpos))])
            continue
        firstsamp=expdat.samples[cpos[0]]
        # set the mapping file values. work on a copy of the mapping entry so the
        # input experiment is not modified
        cmap=dict(expdat.smap[firstsamp])
        for ccpos in cpos[1:]:
            othermap=expdat.smap[expdat.samples[ccpos]]
            for cfield in cmap:
                if cmap[cfield]!=othermap[cfield]:
                    cmap[cfield]='NA'
        if method=='mean':
            joined=np.mean(expdat.data[:,cpos],axis=1)
        elif method=='median':
            joined=np.median(expdat.data[:,cpos],axis=1)
        elif method=='sum':
            # newexp holds the original (absolute) reads for the sum method
            joined=np.sum(newexp.data[:,cpos],axis=1)
            newexp.origreads[cpos[0]]=np.sum(hs.reorder(expdat.origreads,cpos))
        elif method=='fracpres':
            # float division also when running with integer division semantics
            joined=np.sum(expdat.data[:,cpos]>0,axis=1)/float(len(cpos))
        else:
            hs.Debug(9,'method %s not supported' % method)
            return False
        # the joined sample replaces the first sample of the group
        newexp.data[:,cpos[0]]=joined
        keep.append(cpos[0])
        newexp.smap[firstsamp]=cmap
    newexp=hs.reordersamples(newexp,keep)
    if method=='sum':
        newexp=hs.normalizereads(newexp)
    newexp.filters.append('Filter similar samples field %s method %s' % (field,method))
    hs.addcommand(newexp,"filtersimilarsamples",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d samples before filtering, %d after' % (len(expdat.samples),len(newexp.samples)))
    return newexp
def plotexp(exp,sortby=False,numeric=False,minreads=4,rangeall=False,seqdb=None,cdb=None,showline=True,ontofig=False,usegui=True,showxall=False,showcolorbar=False,ptitle=False,lowcutoff=1,uselog=True,showxlabel=True,colormap=False,colorrange=False):
    """
    Plot an experiment as a heatmap (rows are bacteria, columns are samples)

    input:
    exp - from load()
    sortby - name of mapping file field to sort by or False to not sort
    numeric - True if the field is numeric
    minreads - minimum number of reads per bacteria in order to show it or 0 to show all
    rangeall - True to show all frequencies in image scale, false to saturate at 10%
    seqdb - the SRBactDB database (from bactdb.load)
    cdb - the cool sequences database (from cooldb.load)
    showline - if True plot lines between category values
    ontofig - name of ontology to plot for bactdb or false to no plot
    usegui - True use a gui for otu summary, False just print
    showxall - True to show all sample names when not sorting, False to show no more than 10
    showcolorbar - True to plot the colorbar. False to not plot
    ptitle - name of the figure or False to show processing history as name
    lowcutoff - minimal value for read (for 0 log transform) - the minimal resolution - could be 10000*2/origreads
    showxlabel : bool
        True to show the x label (default), False to hide it
    colormap : string or False
        name of colormap or False (default) to use mpl default colormap
    colorrange : [min,max] or False
        [min,max] to set the colormap range, False to use data min,max (default) as specified in rangeall

    output:
    newexp - the plotted experiment (sorted and filtered)
    ax - the plot axis
    """

    hs.Debug(1,"Plot experiment %s" % exp.studyname)
    hs.Debug(1,"Commands:")
    for ccommand in exp.commands:
        hs.Debug(1,"%s" % ccommand)
    # svals holds the value per sample used for the category lines / x labels
    vals=[]
    if sortby:
        hs.Debug(1,"Sorting by field %s" % sortby)
        for csamp in exp.samples:
            vals.append(exp.smap[csamp][sortby])
        if numeric:
            hs.Debug(1,"(numeric sort)")
            vals=hs.tofloat(vals)
        svals,sidx=hs.isort(vals)
        newexp=hs.reordersamples(exp,sidx)
    else:
        hs.Debug(1,"No sample sorting")
        svals=hs.getfieldvals(exp,'#SampleID')
        newexp=hs.copyexp(exp)
    hs.Debug(1,"Filtering min reads. original bacteria - %d" % len(newexp.seqs))
    if minreads>0:
        newexp=hs.filterminreads(newexp,minreads,logit=uselog)
    hs.Debug(1,"New number of bacteria %d" % len(newexp.seqs))
    # attach the databases to the plotted experiment
    newexp.seqdb=seqdb
    newexp.cdb=cdb
    # ldat=ldat[:,sidx]
    ldat=newexp.data
    if uselog:
        hs.Debug(1,"Using log, cutoff at %f" % lowcutoff)
        # the clipping is done in place on newexp.data (the returned experiment)
        ldat[np.where(ldat<lowcutoff)]=lowcutoff
        ldat=np.log2(ldat)
    # NOTE(review): this keeps a reference to rcParams (not a copy), so the
    # restore further down probably does not undo the keymap changes - confirm
    oldparams=plt.rcParams
    mpl.rc('keymap',back='c, backspace')
    mpl.rc('keymap',forward='v')
    mpl.rc('keymap',all_axes='A')
    f=figure()
    # set the colormap to default if not supplied
    if not colormap:
        colormap=plt.rcParams['image.cmap']
    # plot the image
    if colorrange:
        # NOTE(review): the message says 0,10 but the range used is colorrange
        hs.Debug(1,"colormap range is 0,10")
        iax=imshow(ldat,interpolation='nearest',aspect='auto',clim=colorrange,cmap=plt.get_cmap(colormap))
    elif rangeall:
        hs.Debug(1,"colormap range is all")
        iax=imshow(ldat,interpolation='nearest',aspect='auto',cmap=plt.get_cmap(colormap))
    else:
        hs.Debug(1,"colormap range is 0,10")
        iax=imshow(ldat,interpolation='nearest',aspect='auto',clim=[0,10],cmap=plt.get_cmap(colormap))

    if not ptitle:
        # no title supplied - use the (clipped) processing history instead
        hs.Debug(1,"Showing filters in title")
        if (len(newexp.filters))>4:
            cfilters=[newexp.filters[0],'...',newexp.filters[-2],newexp.filters[-1]]
        else:
            cfilters=newexp.filters
        cfilters=hs.clipstrings(cfilters,30)
        ptitle='\n'.join(cfilters)
    title(ptitle,fontsize=10)

    # NOTE(review): get_axes() on the image may not exist in newer matplotlib
    # releases - confirm against the supported matplotlib version
    ax=iax.get_axes()
    ax.autoscale(False)
    if showline:
        hs.Debug(1,"Showing lines")
        labs=[]
        labpos=[]
        linepos=[]
        minpos=0
        # sentinel so the last group of samples is also closed
        svals.append('end')
        for idx,cval in enumerate(svals[:-1]):
            if cval==svals[idx+1]:
                continue
            # label in the middle of the group, line after its last sample
            labpos.append(minpos-0.5+float(idx+1-minpos)/2)
            minpos=idx+1
            linepos.append(idx+0.5)
            labs.append(cval)
        hs.Debug(1,"number of lines is %d" % len(linepos))
        if showxlabel:
            ax.set_xticks(labpos)
            ax.set_xticklabels(labs,rotation=45,ha='right')
        for cx in linepos:
            plot([cx,cx],[-0.5,np.size(ldat,0)-0.5],'k',linewidth=2)
    else:
        hs.Debug(1,"Not showing lines")
        if showxall or len(newexp.samples)<=10:
            hs.Debug(1,"less than 10 samples, showing all sample names")
            ax.set_xticklabels(svals,rotation=90)
            ax.set_xticks(range(len(newexp.samples)))
    tight_layout()
    ax.set_ylim(-0.5,np.size(ldat,0)+0.5)

    if showcolorbar:
        hs.Debug(1,"Showing colorbar")
        cb=colorbar(ticks=list(np.log2([2,10,100,500,1000])))
        cb.ax.set_yticklabels(['<0.02%','0.1%','1%','5%','>10%'])

    # create the plot
    # state stored on the axis for the mouse/keyboard callbacks
    ax.expdat=newexp
    ax.lastselect=-1
    ax.sampline=''
    ax.ofig=f
    ax.labelson=False
    ax.labelnames=[]
    f.canvas.mpl_connect('button_press_event', onplotmouseclick)
    f.canvas.mpl_connect('key_press_event', onplotkeyclick)
    # show()
    plt.rcParams=oldparams

    # if want the ontology analysis for a given category:
    if ontofig:
        hs.Debug(1,"Ontofig is set")
        newexp.ontofigname=ontofig
    else:
        newexp.ontofigname=False

    # if we want gui, open it
    if usegui:
        hs.Debug(1,"Using the GUI window")
        import heatsequer.plots.plotwingui
        guiwin = heatsequer.plots.plotwingui.PlotGUIWindow(newexp)
        # from heatsequer.plots import plotwingui
        # guiwin = plotwingui.PlotGUIWindow(newexp)
        ax.guiwin=guiwin
        guiwin.plotfig=f
        guiwin.plotax=ax
        guiwin.show()
    else:
        ax.guiwin=False
        hs.Debug(7,'Not using gui')

    if newexp.plotmetadata:
        hs.Debug(1,"Experiment has metadata attached for plotting (%d points)" % len(newexp.plotmetadata))
        # each entry is (field,value,color,inverse,beforesample)
        for cmet in newexp.plotmetadata:
            addplotmetadata(newexp,field=cmet[0],value=cmet[1],color=cmet[2],inverse=cmet[3],beforesample=cmet[4])
    show()
    return newexp,ax
def plotexp(exp,sortby=False,numeric=False,minreads=4,rangeall=False,seqdb=None,cdb=None,showline=True,ontofig=False,usegui=True,showxall=False,showcolorbar=False,ptitle=False,lowcutoff=1,uselog=True,showxlabel=True,colormap=False,colorrange=False,linewidth=2,subline='',showhline=True,newfig=True,fixfont=False,fontsize=None,nosort=False,zeroisnone=False,xlabelrotation=45,showtaxnames=False):
    """
    Plot an experiment as a heatmap (rows are bacteria, columns are samples)

    input:
    exp - from load()
    sortby - name of mapping file field to sort by or False to not sort
    numeric - True if the field is numeric
    minreads - minimum number of reads per bacteria in order to show it or 0 to show all
    rangeall - True to show all frequencies in image scale, false to saturate at 10%
    seqdb - the SRBactDB database (from bactdb.load), or None (default) to use hs.bdb
    cdb - the cool sequences database (from cooldb.load), or None (default) to use the heatsequer loaded cdb
    showline - if True plot lines between category values
    ontofig - name of ontology to plot for bactdb or false to no plot
    usegui - True use a gui for otu summary, False just print
    showxall - True to show all sample names when not sorting, False to show no more than 10
    showcolorbar - True to plot the colorbar. False to not plot
    ptitle : str (optional)
        '' (or False) to show the processing history as name, None to not show title, or str of name of the figure
    lowcutoff - minimal value for read (for 0 log transform) - the minimal resolution - could be 10000*2/origreads
    showxlabel : bool
        True to show the x label (default), False to hide it
    colormap : string or False
        name of colormap or False (default) to use mpl default colormap
    colorrange : [min,max] or False
        [min,max] to set the colormap range, False to use data min,max (default) as specified in rangeall
    linewidth : number
        the width of the lines between category values
    subline : str
        Name of category for subline plotting or '' (Default) for no sublines
    showhline : bool
        True (default) to plot the horizontal lines listed in exp.hlines. False to not plot them
    newfig : bool
        True (default) to open figure in new window, False to use current
    fixfont : bool (optional)
        False (default) to keep the default font, True to use a fixed width (monospace) font for the y tick labels
    fontsize : int or None (optional)
        None (default) to use default font size, number to use that font size
    nosort : bool (optional)
        False (default) to sort by the sort field, True to skip the sorting
    zeroisnone : bool (optional)
        False (default) to plot zeros as 0, True to assign None (white color)
    xlabelrotation : int (optional)
        the rotation of the xtick labels
    showtaxnames : bool (optional)
        False (default) to not show tax names (need to press 'h' to show)
        True to show the taxonomy names

    output:
    newexp - the plotted experiment (sorted and filtered)
    ax - the plot axis
    """

    hs.Debug(1,"Plot experiment %s" % exp.studyname)
    hs.Debug(1,"Commands:")
    for ccommand in exp.commands:
        hs.Debug(1,"%s" % ccommand)

    if exp.sparse:
        hs.Debug(9,'Sparse matrix - converting to dense')
        exp=hs.copyexp(exp,todense=True)

    vals=[]
    # fall back to the databases loaded in heatsequer
    if cdb is None:
        cdb=hs.cdb
    if seqdb is None:
        seqdb=hs.bdb
    # svals holds the value per sample used for the category lines / x labels
    if sortby:
        if not nosort:
            hs.Debug(1,"Sorting by field %s" % sortby)
            for csamp in exp.samples:
                vals.append(exp.smap[csamp][sortby])
            if numeric:
                hs.Debug(1,"(numeric sort)")
                vals=hs.tofloat(vals)
            svals,sidx=hs.isort(vals)
            newexp=hs.reordersamples(exp,sidx)
        else:
            hs.Debug(1,"no sorting but showing columns")
            svals=hs.getfieldvals(exp,sortby)
            newexp=hs.copyexp(exp)
    else:
        hs.Debug(1,"No sample sorting")
        svals=hs.getfieldvals(exp,'#SampleID')
        newexp=hs.copyexp(exp)
    hs.Debug(1,"Filtering min reads. original bacteria - %d" % len(newexp.seqs))
    if minreads>0:
        newexp=hs.filterminreads(newexp,minreads,logit=uselog)
    hs.Debug(1,"New number of bacteria %d" % len(newexp.seqs))
    # attach the databases to the plotted experiment
    newexp.seqdb=seqdb
    newexp.cdb=cdb
    newexp.scdb=hs.scdb

    # if usegui:
    #     hs.Debug(1,"Using the GUI window")
    #     import heatsequer.plots.plotwingui
    #     from PyQt4 import QtGui
    #     app = QtGui.QApplication(sys.argv)
    #     guiwin = heatsequer.plots.plotwingui.PlotGUIWindow(newexp)

    # ldat=ldat[:,sidx]
    ldat=newexp.data
    if zeroisnone:
        # NOTE(review): presumably relies on data being a float array (None becomes nan) - confirm
        ldat[ldat==0]=None
    if uselog:
        hs.Debug(1,"Using log, cutoff at %f" % lowcutoff)
        # the clipping is done in place on newexp.data (the returned experiment)
        ldat[np.where(ldat<lowcutoff)]=lowcutoff
        ldat=np.log2(ldat)
    # NOTE(review): this keeps a reference to rcParams (not a copy), so the
    # restore further down probably does not undo the keymap changes - confirm
    oldparams=plt.rcParams
    mpl.rc('keymap',back='c, backspace')
    mpl.rc('keymap',forward='v')
    mpl.rc('keymap',all_axes='A')
    if newfig:
        f=plt.figure(tight_layout=True)
    else:
        f=plt.gcf()
    # set the colormap to default if not supplied
    if not colormap:
        colormap=plt.rcParams['image.cmap']
    # plot the image
    if colorrange:
        # NOTE(review): the message says 0,10 but the range used is colorrange
        hs.Debug(1,"colormap range is 0,10")
        iax=plt.imshow(ldat,interpolation='nearest',aspect='auto',clim=colorrange,cmap=plt.get_cmap(colormap))
    elif rangeall:
        hs.Debug(1,"colormap range is all")
        iax=plt.imshow(ldat,interpolation='nearest',aspect='auto',cmap=plt.get_cmap(colormap))
    else:
        hs.Debug(1,"colormap range is 0,10")
        iax=plt.imshow(ldat,interpolation='nearest',aspect='auto',clim=[0,10],cmap=plt.get_cmap(colormap))

    if ptitle is not None:
        if not ptitle:
            # no title supplied - use the (clipped) processing history instead
            hs.Debug(1,"Showing filters in title")
            if (len(newexp.filters))>4:
                cfilters=[newexp.filters[0],'...',newexp.filters[-2],newexp.filters[-1]]
            else:
                cfilters=newexp.filters
            cfilters=hs.clipstrings(cfilters,30)
            ptitle='\n'.join(cfilters)
        plt.title(ptitle,fontsize=10)

    # NOTE(review): get_axes() on the image may not exist in newer matplotlib
    # releases - confirm against the supported matplotlib version
    ax=iax.get_axes()
    ax.autoscale(False)

    # plot the sublines (smaller category lines)
    if subline:
        slval=hs.getfieldvals(newexp,subline)
        prevval=slval[0]
        for idx,cval in enumerate(slval):
            if cval!=prevval:
                xpos=idx-0.5
                plt.plot([xpos,xpos],[-0.5,np.size(ldat,0)-0.5],'w:')
                prevval=cval

    if showline:
        hs.Debug(1,"Showing lines")
        labs=[]
        labpos=[]
        linepos=[]
        minpos=0
        # sentinel so the last group of samples is also closed
        svals.append('end')
        for idx,cval in enumerate(svals[:-1]):
            if cval==svals[idx+1]:
                continue
            # label in the middle of the group, line after its last sample
            labpos.append(minpos-0.5+float(idx+1-minpos)/2)
            minpos=idx+1
            linepos.append(idx+0.5)
            labs.append(cval)
        hs.Debug(1,"number of lines is %d" % len(linepos))
        if showxlabel:
            ax.set_xticks(labpos)
            ax.set_xticklabels(labs,rotation=xlabelrotation,ha='right')
        for cx in linepos:
            # black line with a white dotted line on top of it
            plt.plot([cx,cx],[-0.5,np.size(ldat,0)-0.5],'k',linewidth=linewidth)
            plt.plot([cx,cx],[-0.5,np.size(ldat,0)-0.5],'w:',linewidth=linewidth)
    else:
        hs.Debug(1,"Not showing lines")
        if showxall or len(newexp.samples)<=10:
            hs.Debug(1,"less than 10 samples, showing all sample names")
            ax.set_xticklabels(svals,rotation=90)
            ax.set_xticks(range(len(newexp.samples)))
    # f.tight_layout()
    ax.set_ylim(-0.5,np.size(ldat,0)-0.5)

    if fixfont:
        fontProperties = {'family':'monospace'}
        ax.set_yticklabels(ax.get_yticks(), fontProperties)

    if showcolorbar:
        hs.Debug(1,"Showing colorbar")
        cb=plt.colorbar(ticks=list(np.log2([2,10,100,500,1000])))
        cb.ax.set_yticklabels(['<0.02%','0.1%','1%','5%','>10%'])

    # create the plot
    # state stored on the axis for the mouse/keyboard callbacks
    ax.expdat=newexp
    ax.lastselect=-1
    ax.sampline=''
    ax.ofig=f
    ax.labelson=False
    ax.labelnames=[]
    f.canvas.mpl_connect('button_press_event', onplotmouseclick)
    f.canvas.mpl_connect('key_press_event', onplotkeyclick)
    # show()
    plt.rcParams=oldparams

    # if want the ontology analysis for a given category:
    if ontofig:
        hs.Debug(1,"Ontofig is set")
        newexp.ontofigname=ontofig
    else:
        newexp.ontofigname=False

    # if we want gui, open it
    if usegui:
        hs.Debug(1,"Using the GUI window")
        import heatsequer.plots.plotwingui
        # from PyQt4 import QtGui
        # app = QtGui.QApplication(sys.argv)
        # NOTE(review): the gui window is constructed twice and only the second
        # instance is used - the first construction looks redundant, confirm
        guiwin = heatsequer.plots.plotwingui.PlotGUIWindow(newexp)
        from heatsequer.plots import plotwingui
        guiwin = plotwingui.PlotGUIWindow(newexp)
        ax.guiwin=guiwin
        guiwin.plotfig=f
        guiwin.plotax=ax
        guiwin.show()
    else:
        ax.guiwin=False
        hs.Debug(7,'Not using gui')

    ax.plot_labelsize=fontsize
    if newexp.plotmetadata:
        hs.Debug(1,"Experiment has metadata attached for plotting (%d points)" % len(newexp.plotmetadata))
        # each entry is (field,value,color,inverse,beforesample)
        for cmet in newexp.plotmetadata:
            addplotmetadata(newexp,field=cmet[0],value=cmet[1],color=cmet[2],inverse=cmet[3],beforesample=cmet[4])
    if showhline:
        if newexp.hlines:
            # horizontal line above the bacteria at each listed position
            for cpos in newexp.hlines:
                plt.plot([0,np.shape(newexp.data)[1]],[cpos-0.5,cpos-0.5],'g')
    plt.show()
    if showtaxnames:
        showtaxonomies(newexp,ax,showdb=False,showcontam=False)

    # if usegui:
    #     app.exec_()

    return newexp,ax
def joinexperiments(exp1,exp2,missingval='NA',origfieldname='origexp',addbefore=False):
    """
    join 2 experiments into a new experiment, adding a new field origfieldname

    The joined experiment contains the union of the sequences of both experiments
    (zero reads where a sequence is not present in one of the experiments) and the
    samples of exp1 followed by the samples of exp2.

    input:
    exp1,exp2 : Experiment
        the experiments to join
    missingval : string
        the value to put when a field is not in the mapping file of one of the experiments
    origfieldname : string
        name of the new field to add which contains the original experiment name.
        if an experiment already has this field, its per-sample values are kept,
        otherwise the studyname of the experiment is used
    addbefore : bool (optional)
        used only when the same sampleid appears in both experiments (passed to hs.renamesamples)
        False (default) to add '-1'/'-2' after the sampleid
        True to add '-1'/'-2' before the sampleid

    output:
    newexp : Experiment
        the joined experiment
    """
    params=locals()

    # test if same sampleid exists in both experiments. if so, add "-1" and "-2" to sampleid
    if set(exp1.samples).intersection(exp2.samples):
        hs.Debug(6,'same sampleID - renaming samples')
        exp1=hs.renamesamples(exp1,'-1',addbefore=addbefore)
        exp2=hs.renamesamples(exp2,'-2',addbefore=addbefore)

    # join the sequences of both experiments
    # ASSUMING SAME SEQ LENGTH!!!!
    # the order is exp1 sequences followed by the exp2-only sequences, so the
    # row order of the joined table is the same on every run (iterating over a
    # set of strings is not)
    allseqs=[]
    alldict={}
    for cseq in list(exp1.seqs)+list(exp2.seqs):
        if cseq not in alldict:
            alldict[cseq]=len(allseqs)
            allseqs.append(cseq)

    # make the new joined data for each experiment (taxonomy is taken from exp1 when available)
    dat1=np.zeros((len(allseqs),np.size(exp1.data,1)))
    dat2=np.zeros((len(allseqs),np.size(exp2.data,1)))
    alltax=[]
    for idx,cseq in enumerate(allseqs):
        if cseq in exp1.seqdict:
            pos1=exp1.seqdict[cseq]
            dat1[idx,:]=exp1.data[pos1,:]
            alltax.append(exp1.tax[pos1])
        else:
            alltax.append(exp2.tax[exp2.seqdict[cseq]])
        if cseq in exp2.seqdict:
            dat2[idx,:]=exp2.data[exp2.seqdict[cseq],:]

    newexp=hs.copyexp(exp1)
    # concatenate the reads
    newexp.data=np.concatenate((dat1,dat2), axis=1)
    newexp.seqdict=alldict
    newexp.seqs=allseqs
    newexp.tax=alltax
    # the sequence ids of the joined experiment are the sequences themselves
    newexp.sids=newexp.seqs
    newexp.samples = list(exp1.samples) + list(exp2.samples)
    newexp.origreads=exp1.origreads+exp2.origreads
    newexp.scalingfactor=np.hstack([exp1.scalingfactor,exp2.scalingfactor])

    # the joined field list: exp1 fields followed by the exp2-only fields (no duplicates)
    newexp.fields=[]
    for cfield in list(exp1.fields)+list(exp2.fields):
        if cfield not in newexp.fields:
            newexp.fields.append(cfield)

    # exp1 samples: fill the fields that exist only in exp2
    onlyexp2=[cfield for cfield in newexp.fields if cfield not in exp1.fields]
    for csamp in exp1.samples:
        for cfield in onlyexp2:
            newexp.smap[csamp][cfield]=missingval

    # exp2 samples: build the mapping entry from scratch
    for csamp in exp2.samples:
        newexp.smap[csamp]={}
        for cfield in newexp.fields:
            if cfield in exp2.fields:
                newexp.smap[csamp][cfield]=exp2.smap[csamp][cfield]
            else:
                newexp.smap[csamp][cfield]=missingval

    # store the original experiment name of each sample
    for cexp in (exp1,exp2):
        hasorigfield=origfieldname in cexp.fields
        for csamp in cexp.samples:
            if hasorigfield:
                cname=cexp.smap[csamp][origfieldname]
            else:
                cname=cexp.studyname
            newexp.smap[csamp][origfieldname]=cname
    if origfieldname not in newexp.fields:
        newexp.fields.append(origfieldname)

    newexp.filters.append('joined with %s' % exp2.studyname)
    hs.addcommand(newexp,"joinexperiments",params=params,replaceparams={'exp1':exp1,'exp2':exp2})
    return newexp
def filterwave(expdat,field=False,numeric=True,minfold=2,minlen=3,step=1,direction='up',posloc='start'):
    """
    filter bacteria, keeping only ones that show a consecutive region (window) of samples
    with higher/lower mean than the other samples
    Done by scanning all windowlen/startpos options for each bacteria
    The comparison is done on log2 transformed reads (values below 1 are treated as 1)

    input:
    expdat : Experiment
    field : string
        The field to sort by or False to skip sorting
    numeric : bool
        For the sorting according to field (does not matter if field is False)
    minfold : float
        The minimal difference between the mean log2 reads inside the window and the
        mean log2 reads of all samples outside the window in order to keep the bacteria
        (i.e. minfold=2 requires a 4 fold change)
    minlen : int
        The minimal window len for over/under expression testing
    step : int
        The skip between tested window lengths (to make it faster use a larger skip)
    direction : string
        'both' - test both over and under expression in the window
        'up' - only overexpressed
        'down' - only underexpressed
    posloc : string
        The position used for ordering the bacteria:
        'start' (or 'maxstart') - the beginning of the maximal wave
        'mid' (or 'maxmid') - the middle of the maximal wave
        'gstart' - the beginning of the first window with >=minfold change

    output:
    newexp : Experiment
        The filtered experiment, sorted according to window start samples position
        (False if posloc is not supported)
    """
    params=locals()

    # validate posloc before doing any work ('maxstart'/'maxmid' are accepted as aliases)
    poslocmodes={'start':'start','maxstart':'start','mid':'mid','maxmid':'mid','gstart':'gstart'}
    if posloc not in poslocmodes:
        hs.Debug(9,'posloc not supported %s' % posloc)
        return False
    posmode=poslocmodes[posloc]

    # sort if needed
    if field:
        newexp=hs.sortsamples(expdat,field,numeric=numeric)
    else:
        newexp=hs.copyexp(expdat)

    # np.maximum returns a new array, so the reads stored in newexp are not modified
    dat=np.log2(np.maximum(newexp.data,1))
    numsamples=len(newexp.samples)
    numbact=len(newexp.seqs)
    maxdiff=np.zeros([numbact])
    maxpos=np.zeros([numbact])-1
    maxlen=np.zeros([numbact])
    # NOTE(review): with these bounds a window never contains the last sample
    # (startpos+cwin<=numsamples-1), which also guarantees at least one sample
    # outside the window - confirm this is the intended scan range
    for startpos in range(numsamples-minlen):
        for cwin in np.arange(minlen,numsamples-startpos,step):
            endpos=startpos+cwin
            meanin=np.mean(dat[:,startpos:endpos],axis=1)
            # all the samples before and after the window
            outpos=np.hstack([np.arange(startpos),np.arange(endpos,numsamples)])
            meanout=np.mean(dat[:,outpos],axis=1)
            cdiff=meanin-meanout
            if direction=='both':
                cdiff=np.abs(cdiff)
            elif direction=='down':
                cdiff=-cdiff
            # bacteria for which this window is better than all previous windows
            improved=cdiff>maxdiff
            if posmode=='gstart':
                # keep only the first window start that passes the threshold
                usepos=np.logical_and(cdiff>=minfold,maxpos==-1)
                maxpos[usepos]=startpos
            elif posmode=='start':
                maxpos[improved]=startpos
            else:
                maxpos[improved]=startpos+int(cwin/2)
            maxlen[improved]=cwin
            maxdiff=np.maximum(maxdiff,cdiff)

    # keep the bacteria passing the threshold, ordered by their wave position
    keep=np.where(maxdiff>=minfold)[0]
    keep=keep[np.argsort(maxpos[keep])]
    for ci in keep:
        hs.Debug(6,'bacteria %s startpos %d len %d diff %f' % (newexp.tax[ci],maxpos[ci],maxlen[ci],maxdiff[ci]))
    newexp=hs.reorderbacteria(newexp,keep)
    newexp.filters.append('Filter wave field=%s minlen=%d' % (field,minlen))
    hs.addcommand(newexp,"filterwave",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d bacteria before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
    return newexp
def joinexperiments(exp1,exp2,missingval='NA',origfieldname='origexp',addbefore=False):
    """
    join 2 experiments into a new experiment, adding a new field origfieldname

    The joined experiment contains the union of the sequences of both experiments
    (zero reads where a sequence is not present in one of the experiments) and the
    samples of exp1 followed by the samples of exp2.
    Kept consistent with the other joinexperiments definition earlier in this file
    (same signature, sample renaming and scalingfactor handling).

    input:
    exp1,exp2 : Experiment
        the experiments to join
    missingval : string
        the value to put when a field is not in the mapping file of one of the experiments
    origfieldname : string
        name of the new field to add which contains the original experiment name.
        if an experiment already has this field, its per-sample values are kept,
        otherwise the studyname of the experiment is used
    addbefore : bool (optional)
        used only when the same sampleid appears in both experiments (passed to hs.renamesamples)
        False (default) to add '-1'/'-2' after the sampleid
        True to add '-1'/'-2' before the sampleid

    output:
    newexp : Experiment
        the joined experiment
    """
    params=locals()

    # same sampleid in both experiments would make the exp2 mapping overwrite the
    # exp1 mapping, so add "-1" and "-2" to the sampleids in that case
    if set(exp1.samples).intersection(exp2.samples):
        hs.Debug(6,'same sampleID - renaming samples')
        exp1=hs.renamesamples(exp1,'-1',addbefore=addbefore)
        exp2=hs.renamesamples(exp2,'-2',addbefore=addbefore)

    # join the sequences of both experiments
    # ASSUMING SAME SEQ LENGTH!!!!
    # the order is exp1 sequences followed by the exp2-only sequences, so the
    # row order of the joined table is the same on every run
    allseqs=[]
    alldict={}
    for cseq in list(exp1.seqs)+list(exp2.seqs):
        if cseq not in alldict:
            alldict[cseq]=len(allseqs)
            allseqs.append(cseq)

    # make the new joined data for each experiment (taxonomy is taken from exp1 when available)
    dat1=np.zeros((len(allseqs),np.size(exp1.data,1)))
    dat2=np.zeros((len(allseqs),np.size(exp2.data,1)))
    alltax=[]
    for idx,cseq in enumerate(allseqs):
        if cseq in exp1.seqdict:
            pos1=exp1.seqdict[cseq]
            dat1[idx,:]=exp1.data[pos1,:]
            alltax.append(exp1.tax[pos1])
        else:
            alltax.append(exp2.tax[exp2.seqdict[cseq]])
        if cseq in exp2.seqdict:
            dat2[idx,:]=exp2.data[exp2.seqdict[cseq],:]

    newexp=hs.copyexp(exp1)
    # concatenate the reads
    newexp.data=np.concatenate((dat1,dat2), axis=1)
    newexp.seqdict=alldict
    newexp.seqs=allseqs
    newexp.tax=alltax
    # the sequence ids of the joined experiment are the sequences themselves
    newexp.sids=newexp.seqs
    newexp.samples = list(exp1.samples) + list(exp2.samples)
    newexp.origreads=exp1.origreads+exp2.origreads
    # without this the joined experiment keeps only the exp1 scaling factors
    newexp.scalingfactor=np.hstack([exp1.scalingfactor,exp2.scalingfactor])

    # the joined field list: exp1 fields followed by the exp2-only fields (no duplicates)
    newexp.fields=[]
    for cfield in list(exp1.fields)+list(exp2.fields):
        if cfield not in newexp.fields:
            newexp.fields.append(cfield)

    # exp1 samples: fill the fields that exist only in exp2
    onlyexp2=[cfield for cfield in newexp.fields if cfield not in exp1.fields]
    for csamp in exp1.samples:
        for cfield in onlyexp2:
            newexp.smap[csamp][cfield]=missingval

    # exp2 samples: build the mapping entry from scratch
    for csamp in exp2.samples:
        newexp.smap[csamp]={}
        for cfield in newexp.fields:
            if cfield in exp2.fields:
                newexp.smap[csamp][cfield]=exp2.smap[csamp][cfield]
            else:
                newexp.smap[csamp][cfield]=missingval

    # store the original experiment name of each sample, keeping existing values
    # (e.g. when one of the experiments is itself the result of a join)
    for cexp in (exp1,exp2):
        hasorigfield=origfieldname in cexp.fields
        for csamp in cexp.samples:
            if hasorigfield:
                cname=cexp.smap[csamp][origfieldname]
            else:
                cname=cexp.studyname
            newexp.smap[csamp][origfieldname]=cname
    # do not add the field name twice if it already exists
    if origfieldname not in newexp.fields:
        newexp.fields.append(origfieldname)

    newexp.filters.append('joined with %s' % exp2.studyname)
    hs.addcommand(newexp,"joinexperiments",params=params,replaceparams={'exp1':exp1,'exp2':exp2})
    return newexp