def savetobiom(expdat, filename, format='hdf5', addtax=True, useorigreads=True, exporthistory=True, logtransform=False):
    """
    Save an experiment to a biom table (also writes filename.map.txt and,
    optionally, filename.commands.txt).

    input:
    expdat : Experiment
    filename : string
        Name of the file to save to
    format : string
        Format of the file ('hdf5','json','txt')
    addtax : bool
        True to add taxonomy metadata, False to not add
    useorigreads : bool
        True (default) to use original number of reads, False to use normalized (sum=10k)
    exporthistory : bool
        True (default) to save also the history (to filename.commands.txt)
    logtransform : bool
        True to log transform the data, false (default) to save original data
    """
    savemap(expdat, filename + '.map.txt')
    if exporthistory:
        savecommands(expdat, filename + '.commands.txt')
    hs.Debug(1, 'Saving biom table %s' % filename)
    if useorigreads:
        newexp = hs.toorigreads(expdat)
    else:
        newexp = expdat
    # if we need to log transform the reads
    if logtransform:
        # BUGFIX: the previous code rebound a local (ldat=np.log2(ldat)) and
        # never stored the result, so the saved table was clipped but not
        # log-transformed; it also clipped newexp.data in place, which (when
        # useorigreads=False) mutated the caller's expdat. Work on a copy and
        # assign the transformed matrix back.
        newexp = hs.copyexp(newexp)
        lowcutoff = 1
        # clip everything below lowcutoff so log2 is defined, then transform
        newexp.data = np.log2(np.maximum(newexp.data, lowcutoff))
    tab = createbiomtablefromexp(newexp, addtax=addtax)
    if format == 'hdf5':
        with biom.util.biom_open(filename, 'w') as f:
            tab.to_hdf5(f, "heatsequer")
    elif format == 'json':
        with open(filename, 'w') as f:
            tab.to_json("heatsequer", f)
    elif format == 'txt':
        s = tab.to_tsv()
        with open(filename, 'w') as f:
            f.write(s)
    else:
        hs.Debug(9, 'file format not supported')
        return
    hs.Debug(6, 'table saved to file %s' % filename)
    return
def savetobiom(expdat,filename,format='hdf5',addtax=True,useorigreads=True):
    """
    Save an experiment to a biom table (also writes filename.map.txt)

    input:
    expdat : Experiment
    filename : string
        Name of the file to save to
    format : string
        Format of the file ('hdf5','json','txt')
    addtax : bool
        True to add taxonomy metadata, False to not add
    useorigreads : bool
        True (default) to use original number of reads, False to use normalized (sum=10k)
    """
    # NOTE(review): this is a duplicate definition of savetobiom. Appearing later
    # in the file, it shadows the earlier, more featureful version (which also
    # supports exporthistory and logtransform). Confirm which version is the
    # intended one and delete the other.
    savemap(expdat,filename+'.map.txt')
    hs.Debug(1,'Saving biom table %s' % filename)
    if useorigreads:
        # convert back to the original (unnormalized) read counts before export
        newexp=hs.toorigreads(expdat)
    else:
        newexp=expdat
    tab=createbiomtablefromexp(newexp,addtax=addtax)
    if format=='hdf5':
        with biom.util.biom_open(filename, 'w') as f:
            tab.to_hdf5(f, "heatsequer")
    elif format=='json':
        with open(filename,'w') as f:
            tab.to_json("heatsequer",f)
    elif format=='txt':
        s=tab.to_tsv()
        with open(filename,'w') as f:
            f.write(s)
    else:
        # unsupported format: nothing has been written to `filename` itself
        # (the .map.txt file above was already written, though)
        hs.Debug(9,'file format not supported')
        return
    hs.Debug(6,'table saved to file %s' % filename)
    return
def subsample(expdat,numreads=10000,inplace=False):
    """
    subsample (rarify) reads from all samples in an experiment

    input:
    expdat
    numreads - number of reads to subsample to
    inplace - true to replace current experiment

    output:
    newexp - the new subsampled experiment
    """
    import biom
    params=locals()

    # drop samples with fewer than numreads original reads, then go back to
    # original (unnormalized) counts so the biom subsampling works on integers
    newexp=hs.filterorigreads(expdat,numreads,inplace)
    newexp=hs.toorigreads(newexp,inplace=True)

    # build a biom table (rows=sequences, columns=samples) and rarify it
    btable=biom.table.Table(newexp.data,newexp.seqs,newexp.samples)
    btable=btable.subsample(numreads,axis='observation')

    # sanity check: the sample order must be unchanged by the subsampling
    for cidx,csamp in enumerate(btable.ids(axis='sample')):
        if csamp!=newexp.samples[cidx]:
            print('problem with sample ids!!!!')

    # subsampling can drop/reorder sequences - realign the experiment to match
    neworder=[newexp.seqdict[cseq] for cseq in btable.ids(axis='observation')]
    newexp=hs.reorderbacteria(newexp,neworder,inplace=True)

    # copy the rarified counts back and renormalize to 10k reads per sample
    newexp.data=btable.matrix_data.todense().A
    newexp=normalizereads(newexp,numreads=10000,inplace=True,fixorig=False)
    for cidx in range(len(newexp.samples)):
        newexp.origreads[cidx]=numreads
    newexp=updateorigreads(newexp)

    newexp.filters.append("subsample to %d" % numreads)
    hs.addcommand(newexp,"subsample",params=params,replaceparams={'expdat':expdat})
    return newexp
def filtersimilarsamples(expdat,field,method='mean'):
    """
    join similar samples into one sample (i.e. to remove samples of same individual)

    input:
    expdat : Experiment
    field : string
        Name of the field containing the values (for which similar values will be joined)
    method : string
        What to do with samples with similar value. options:
        'mean' - replace with a sample containing the mean of the samples
        'median'- replace with a sample containing the median of the samples
        'random' - replace with a single random sample out of these samples
        'sum' - replace with sum of original reads in all samples, renormalized after to 10k and orignumreads updated
        'fracpres' - replace with fraction of samples where the bacteria is present

    output:
    newexp : Experiment
        like the input experiment but only one sample per unique value in field
        (False on unsupported method)
    """
    params=locals()

    newexp=hs.copyexp(expdat)
    # for 'sum' we need the original (unnormalized) reads so the per-sample sums are meaningful
    if method=='sum':
        newexp=hs.toorigreads(newexp)
    uvals=hs.getfieldvals(expdat,field,ounique=True)
    keep=[]
    for cval in uvals:
        cpos=hs.findsamples(expdat,field,cval)
        # a single sample with this value - keep as is
        if len(cpos)==1:
            keep.append(cpos[0])
            continue
        if method=='random':
            keep.append(cpos[np.random.randint(len(cpos))])
            continue
        # merge the mapping file values: any field that disagrees between the
        # joined samples becomes 'NA'.
        # BUGFIX: copy the dict - the previous code aliased expdat's smap entry,
        # so the 'NA' writes below mutated the input experiment's metadata in place.
        cmap=dict(expdat.smap[expdat.samples[cpos[0]]])
        for ccpos in cpos[1:]:
            for cfield in cmap.keys():
                if cmap[cfield]!=expdat.smap[expdat.samples[ccpos]][cfield]:
                    cmap[cfield]='NA'
        # the joined sample is stored in the first position (cpos[0]);
        # the other positions are dropped by reordersamples(keep) below
        if method=='mean':
            cval=np.mean(expdat.data[:,cpos],axis=1)
            newexp.data[:,cpos[0]]=cval
            keep.append(cpos[0])
        elif method=='median':
            cval=np.median(expdat.data[:,cpos],axis=1)
            newexp.data[:,cpos[0]]=cval
            keep.append(cpos[0])
        elif method=='sum':
            # sum over newexp (already converted to original reads above)
            cval=np.sum(newexp.data[:,cpos],axis=1)
            newexp.data[:,cpos[0]]=cval
            newexp.origreads[cpos[0]]=np.sum(hs.reorder(expdat.origreads,cpos))
            keep.append(cpos[0])
        elif method=='fracpres':
            # fraction of the joined samples in which each bacteria is present
            cval=np.sum(expdat.data[:,cpos]>0,axis=1)
            newexp.data[:,cpos[0]]=cval/len(cpos)
            keep.append(cpos[0])
        else:
            hs.Debug(9,'method %s not supported' % method)
            return False
        newexp.smap[expdat.samples[cpos[0]]]=cmap
    newexp=hs.reordersamples(newexp,keep)
    # 'sum' worked on original reads - renormalize back to 10k per sample
    if method=='sum':
        newexp=hs.normalizereads(newexp)
    newexp.filters.append('Filter similar samples field %s method %s' % (field,method))
    hs.addcommand(newexp,"filtersimilarsamples",params=params,replaceparams={'expdat':expdat})
    hs.Debug(6,'%d samples before filtering, %d after' % (len(expdat.samples),len(newexp.samples)))
    return newexp