Beispiel #1
0
def savetobiom(expdat,filename,format='hdf5',addtax=True,useorigreads=True,exporthistory=True,logtransform=False):
	"""
	Save an experiment to a biom table (together with its mapping file and, optionally, its command history)

	input:
	expdat : Experiment
		The experiment to save (its data is not modified by this function)
	filename : string
		Name of the file to save to. The mapping file is written to filename+'.map.txt'
	format : string
		Format of the file ('hdf5','json','txt')
	addtax : bool
		True to add taxonomy metadata, False to not add
	useorigreads : bool
		True (default) to use original number of reads, False to use normalized (sum=10k)
	exporthistory : bool
		True (default) to save also the history (to filename.commands.txt)
	logtransform : bool
		True to log transform the data, False (default) to save original data

	output:
	None. If the format is not supported a debug message is emitted and no biom file is written
	(the mapping/history files have already been written at that point).
	"""
	savemap(expdat,filename+'.map.txt')
	if exporthistory:
		savecommands(expdat,filename+'.commands.txt')
	hs.Debug(1,'Saving biom table %s' % filename)
	if useorigreads:
		newexp=hs.toorigreads(expdat)
	else:
		newexp=expdat

	# if we need to log transform the reads
	if logtransform:
		lowcutoff=1
		# Work on a copy: newexp may be the caller's own experiment (useorigreads=False),
		# and the transformed values must only end up in the saved table.
		newexp=hs.copyexp(newexp)
		# clamp values below the cutoff (avoids log of 0) and store the log2 values
		# back into the experiment that is actually exported
		newexp.data=np.log2(np.maximum(newexp.data,lowcutoff))

	tab=createbiomtablefromexp(newexp,addtax=addtax)
	if format=='hdf5':
		with biom.util.biom_open(filename, 'w') as f:
			tab.to_hdf5(f, "heatsequer")
	elif format=='json':
		with open(filename,'w') as f:
			tab.to_json("heatsequer",f)
	elif format=='txt':
		s=tab.to_tsv()
		with open(filename,'w') as f:
			f.write(s)
	else:
		hs.Debug(9,'file format not supported')
		return
	hs.Debug(6,'table saved to file %s' % filename)
	return
Beispiel #2
0
def savetobiom(expdat,filename,format='hdf5',addtax=True,useorigreads=True):
	"""
	Write an experiment to disk as a biom table (the mapping file is saved alongside it)

	input:
	expdat : Experiment
		The experiment to save
	filename : string
		Name of the biom file to write. The mapping file goes to filename+'.map.txt'
	format : string
		Output format: 'hdf5', 'json' or 'txt'
	addtax : bool
		True to add taxonomy metadata, False to not add
	useorigreads : bool
		True (default) to use original number of reads, False to use normalized (sum=10k)

	output:
	None. For an unsupported format a debug message is emitted and no biom file is written.
	"""
	savemap(expdat,filename+'.map.txt')
	hs.Debug(1,'Saving biom table %s' % filename)

	# choose which read counts go into the table
	srcexp=hs.toorigreads(expdat) if useorigreads else expdat
	biomtable=createbiomtablefromexp(srcexp,addtax=addtax)

	# the table is built before the format is checked, same as for the supported formats
	if format not in ('hdf5','json','txt'):
		hs.Debug(9,'file format not supported')
		return

	if format=='json':
		with open(filename,'w') as outfile:
			biomtable.to_json("heatsequer",outfile)
	elif format=='txt':
		tsvtext=biomtable.to_tsv()
		with open(filename,'w') as outfile:
			outfile.write(tsvtext)
	else:
		# only 'hdf5' is left
		with biom.util.biom_open(filename, 'w') as outfile:
			biomtable.to_hdf5(outfile, "heatsequer")

	hs.Debug(6,'table saved to file %s' % filename)
	return
Beispiel #3
0
def subsample(expdat,numreads=10000,inplace=False):
	"""
	subsample (rarify) reads from all samples in an experiment

	input:
	expdat : Experiment
	numreads : int
		number of reads to subsample to
	inplace : bool
		true to replace current experiment (the flag is forwarded to hs.filterorigreads)

	output:
	newexp : Experiment
		the new subsampled experiment
	"""
	# Capture the call arguments before anything else is bound locally, so the
	# command history only records real parameters (not the biom module imported below).
	params=locals()

	import biom

	newexp=hs.filterorigreads(expdat,numreads,inplace)
	newexp=hs.toorigreads(newexp,inplace=True)

	table=biom.table.Table(newexp.data,newexp.seqs,newexp.samples)
	# NOTE(review): biom documents `axis` as the axis to sample over - confirm that
	# 'observation' gives per-sample rarefaction with the installed biom version.
	table=table.subsample(numreads,axis='observation')

	# sanity check: the sample order must be unchanged, since the data matrix is copied back by position
	for idx,cid in enumerate(table.ids(axis='sample')):
		if cid!=newexp.samples[idx]:
			hs.Debug(9,'problem with sample ids!!!!')

	# reorder the bacteria to match the observation order of the subsampled table
	newpos=[newexp.seqdict[cseq] for cseq in table.ids(axis='observation')]
	newexp=hs.reorderbacteria(newexp,newpos,inplace=True)
	newexp.data=table.matrix_data.todense().A

	# the normalization target is the fixed value 10000, independent of numreads
	newexp=normalizereads(newexp,numreads=10000,inplace=True,fixorig=False)
	# every sample is recorded as having numreads original reads
	for cidx in range(len(newexp.samples)):
		newexp.origreads[cidx]=numreads
	newexp=updateorigreads(newexp)
	newexp.filters.append("subsample to %d" % numreads)
	hs.addcommand(newexp,"subsample",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #4
0
def filtersimilarsamples(expdat,field,method='mean'):
	"""
	join similar samples into one sample (i.e. to remove samples of same individual)
	input:
	expdat : Experiment
		The input experiment (neither its data nor its mapping values are modified)
	field : string
		Name of the field containing the values (for which similar values will be joined)
	method : string
		What to do with samples with similar value. options:
		'mean' - replace with a sample containing the mean of the samples
		'median'- replace with a sample containing the median of the samples
		'random' - replace with a single random sample out of these samples
		'sum' - replace with sum of original reads in all samples, renormalized after to 10k and orignumreads updated
		'fracpres' - replace with fraction of samples where the bacteria is present
	output:
	newexp : Experiment
		like the input experiment but only one sample per unique value in field.
		False is returned if method is not supported and some value has more than one sample.
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	if method=='sum':
		newexp=hs.toorigreads(newexp)
	uvals=hs.getfieldvals(expdat,field,ounique=True)
	keep=[]
	for cval in uvals:
		cpos=hs.findsamples(expdat,field,cval)
		# a single sample with this value - keep it as is
		if len(cpos)==1:
			keep.append(cpos[0])
			continue
		if method=='random':
			keep.append(cpos[np.random.randint(len(cpos))])
			continue

		# the first sample of the group is the one that holds the joined result
		firstpos=cpos[0]

		# set the mapping file values: fields that differ between the joined samples become 'NA'.
		# Work on a copy so the mapping of the input experiment is left untouched.
		cmap=expdat.smap[expdat.samples[firstpos]].copy()
		for ccpos in cpos[1:]:
			othermap=expdat.smap[expdat.samples[ccpos]]
			for cfield in cmap:
				if cmap[cfield]!=othermap[cfield]:
					cmap[cfield]='NA'

		if method=='mean':
			newdat=np.mean(expdat.data[:,cpos],axis=1)
		elif method=='median':
			newdat=np.median(expdat.data[:,cpos],axis=1)
		elif method=='sum':
			# newexp holds the original reads here (converted above)
			newdat=np.sum(newexp.data[:,cpos],axis=1)
			newexp.origreads[firstpos]=np.sum(hs.reorder(expdat.origreads,cpos))
		elif method=='fracpres':
			# float() forces true division also when run under python 2
			newdat=np.sum(expdat.data[:,cpos]>0,axis=1)/float(len(cpos))
		else:
			hs.Debug(9,'method %s not supported' % method)
			return False
		newexp.data[:,firstpos]=newdat
		keep.append(firstpos)
		newexp.smap[expdat.samples[firstpos]]=cmap
	newexp=hs.reordersamples(newexp,keep)
	if method=='sum':
		newexp=hs.normalizereads(newexp)
	newexp.filters.append('Filter similar samples field %s method %s' % (field,method))
	hs.addcommand(newexp,"filtersimilarsamples",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d samples before filtering, %d after' % (len(expdat.samples),len(newexp.samples)))
	return newexp