Beispiel #1
0
def filterminreads(exp,minreads,logit=True,useabs=False):
	"""
	keep only the bacteria whose reads, summed over all samples, reach minreads
	input:
	exp : Experiment
	minreads : float
		the minimal total number of reads (summed over all samples) for a bacteria to be kept
	logit : bool
		True to add to command log, False to not (if called from another heatsequer function)
	useabs : bool
		True to sum the absolute values of the reads, False (default) to sum the reads as they are
	output:
	newexp - the filtered experiment
	"""
	params=locals()

	# total reads per bacteria (sum over axis 1 of the data)
	readmat=np.abs(exp.data) if useabs else exp.data
	totals=np.sum(readmat,axis=1)
	keeppos=np.where(totals>=minreads)[0]
	newexp=hs.reorderbacteria(exp,keeppos)
	if logit:
		newexp.filters.append('filter min reads %d' % minreads)
		hs.addcommand(newexp,"filterminreads",params=params,replaceparams={'exp':exp})
	hs.Debug(6,'%d Bacteria left' % len(newexp.sids))
	return newexp
Beispiel #2
0
def clustersamples(exp,minreads=0):
	"""
	reorder the samples of an experiment using single linkage hierarchical clustering
	distances are bray-curtis on the log2 transformed reads (reads <=2 are set to 2 before the log)
	input:
	exp : Experiment
	minreads : int
		the minimal original number of reads per sample to keep it (passed to filterorigreads)
	output:
	newexp : Experiment
		the filtered and clustered experiment
	"""
	params=locals()

	newexp=hs.filterorigreads(exp,minreads)
	# work on a transposed copy of the data, so pdist compares the columns (samples) of the data
	sampdat=np.transpose(copy.copy(newexp.data))
	sampdat[sampdat<=2]=2
	sampdat=np.log2(sampdat)
	# cluster and use the dendrogram leaf order as the new sample order
	distvec=spatial.distance.pdist(sampdat,metric='braycurtis')
	tree=cluster.hierarchy.single(distvec)
	leaforder=cluster.hierarchy.leaves_list(tree)

	newexp=hs.reordersamples(newexp,leaforder)
	hs.addcommand(newexp,"clustersamples",params=params,replaceparams={'exp':exp})
	newexp.filters.append("cluster samples minreads=%d" % minreads)
	return newexp
Beispiel #3
0
def normalizeprctile(expdat,percent=80):
	"""
	normalize reads per sample so the given percentile (rather than the mean) is the same in all samples
	used to reduce effect of outliers (compositionality correction)
	note the percentile is calculated on the same set of bacteria for all samples
	input:
	expdat : Experiment
	percent : float
		the percentile to normalize (0-100)

	output:
	newexp : Experiment
		the new normalized experiment (contains all the bacteria of expdat)
	"""
	params=locals()

	# select the bacteria to use - don't want to include very low freq. bacteria
	filtexp=hs.filterminreads(expdat,1*len(expdat.samples))

	# percentile of each sample, relative to the mean percentile over all samples
	percvals=np.percentile(filtexp.data,percent,axis=0)
	meanperc=np.mean(percvals)
	if meanperc!=0:
		percvals=percvals/meanperc
	# a sample with a 0 percentile cannot be rescaled - leave it unchanged instead of dividing by 0
	percvals[percvals==0]=1
	newexp=hs.copyexp(expdat)
	# divide (not multiply) by the relative percentile: a sample with a high percentile
	# must be scaled down so that all samples end up with the same percentile value
	for idx in range(len(expdat.samples)):
		newexp.data[:,idx]=newexp.data[:,idx]/percvals[idx]
	newexp.filters.append("normalize percentile %f" % percent)
	hs.addcommand(newexp,"normalizeprctile",params=params,replaceparams={'expdat':expdat})

	return newexp
Beispiel #4
0
def toorigreads(expdat,inplace=False):
	"""
	rescale every sample so its total number of reads equals its origreads value
	input:
	expdat
	inplace - True to modify expdat itself, False to work on a copy

	output:
	newexp - each sample sums to its origreads value (samples with 0 total reads are left unchanged)
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	for pos in range(len(newexp.samples)):
		sampsum=np.sum(newexp.data[:,pos])
		target=newexp.origreads[pos]
		# an empty sample cannot be rescaled
		if sampsum!=0:
			newexp.data[:,pos]=newexp.data[:,pos]*(float(target)/sampsum)
	newexp.filters.append("changed reads to origread value")
	hs.addcommand(newexp,"toorigreads",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #5
0
def clusterbacteria(exp,minreads=0,uselog=True):
	"""
	reorder the bacteria of an experiment using single linkage hierarchical clustering (euclidean distance)
	input:
	exp : Experiment
	minreads : int
		the minimal number of reads to keep before clustering (to make faster)
	uselog : bool
		True to log2 transform the reads before scaling (reads <=2 are set to 2), False to use the reads as they are

	output:
	newexp : Experiment
		the filtered and clustered experiment
	"""
	params=locals()

	newexp=hs.filterminreads(exp,minreads,logit=False)
	# work on a copy of the data
	bactdat=copy.copy(newexp.data)
	if uselog:
		bactdat[bactdat<=2]=2
		bactdat=np.log2(bactdat)
	# scale along axis 1 (per bacteria) - scale() is defined outside this block
	# (presumably sklearn.preprocessing.scale - TODO confirm)
	bactdat=scale(bactdat,axis=1,copy=False)
	# cluster and use the dendrogram leaf order as the new bacteria order
	distvec=spatial.distance.pdist(bactdat,metric='euclidean')
	tree=cluster.hierarchy.single(distvec)
	leaforder=cluster.hierarchy.leaves_list(tree)

	newexp=hs.reorderbacteria(newexp,leaforder)
	hs.addcommand(newexp,"clusterbacteria",params=params,replaceparams={'exp':exp})
	newexp.filters.append("cluster bacteria minreads=%d" % minreads)
	return newexp
Beispiel #6
0
def clipseqs(expdat,startpos,addseq='TAC'):
	"""
	clip the first nucleotides in all sequences in experiment
	to fix offset in sequencing
	sequences that become identical after clipping are joined (their reads are summed into the first occurrence)
	input:
	expdat
	startpos - the position to start from (0 indexed) or negative to add nucleotides
	addseq - the sequence to add (just a guess) if startpos is negative.
		the first abs(startpos) chars of addseq are prepended and the result is cut to the length of the first sequence
	output:
	newexp - new experiment with all sequences clipped and joined identical sequences
	"""
	params=locals()

	newexp=copy.deepcopy(expdat)
	newseqs=[]
	# clipped sequence -> row (in the un-reordered data) where its reads are accumulated
	firstpos={}
	keeppos=[]
	for idx,cseq in enumerate(newexp.seqs):
		if startpos>=0:
			cseq=cseq[startpos:]
		else:
			cseq=addseq[:abs(startpos)]+cseq
			cseq=cseq[:len(expdat.seqs[0])]
		if cseq in firstpos:
			newexp.data[firstpos[cseq],:] += newexp.data[idx,:]
		else:
			firstpos[cseq]=idx
			newseqs.append(cseq)
			keeppos.append(idx)
	newexp=reorderbacteria(newexp,keeppos)
	newexp.seqs=newseqs
	# seqdict must hold the positions in the new (reordered) experiment.
	# the original row indices in firstpos are stale once duplicate rows were dropped
	newexp.seqdict={cseq:pos for pos,cseq in enumerate(newseqs)}
	hs.addcommand(newexp,"clipseqs",params=params,replaceparams={'expdat':expdat})
	newexp.filters.append("trim %d nucleotides" % startpos)
	return newexp
Beispiel #7
0
def convertdatefield(expdat,field,newfield,timeformat='%m/%d/%y %H:%M'):
	"""
	add a numeric (seconds since epoch) field calculated from a field containing date/time strings
	input:
	expdat : Experiment
		the experiment to add the field to
	field : string
		name of the field containing the date/time strings
	newfield : string
		name of the new field (with seconds since epoch, as returned by time.mktime)
	timeformat : string
		format of the date/time field (time.strptime format string)
	output:
	newexp : Experiment
		the experiment with the added time since epoch field
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	newexp.fields.append(newfield)
	for csamp in newexp.samples:
		parsed=time.strptime(newexp.smap[csamp][field],timeformat)
		newexp.smap[csamp][newfield]=time.mktime(parsed)
	newexp.filters.append('add time field %s (based on field %s)' % (newfield,field))
	hs.addcommand(newexp,"convertdatefield",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #8
0
def normalizereads(expdat,numreads=10000,fixorig=False,inplace=False):
	"""
	rescale every sample so its reads sum to numreads (default 10k)
	input:
	expdat
	numreads - the number of reads each sample should sum to
	fixorig - True to also divide the origreads of each sample by the scaling ratio, False to keep origreads as before
	inplace - True to modify expdat itself, False to create a new experiment

	output:
	newexp - the normalized experiment (samples with 0 total reads are left unchanged)
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	for pos in range(len(newexp.samples)):
		sampsum=np.sum(newexp.data[:,pos])
		# an empty sample cannot be rescaled
		if sampsum!=0:
			ratio=float(numreads)/sampsum
			newexp.data[:,pos]=newexp.data[:,pos]*ratio
			if fixorig:
				hs.Debug(2,'fixing original frequencies')
				newexp.origreads[pos]=float(newexp.origreads[pos])/ratio
	newexp.filters.append("renormalized reads to sum %d" % numreads)
	hs.addcommand(newexp,"normalizereads",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #9
0
def sortsamples(exp,field,numeric=False,logit=True):
	"""
	reorder the samples according to their values in field
	input:
	exp : Experiment
	field : string
		name of the field to sort by
	numeric : bool
		True to convert the field values to float before sorting, False to sort them as they are
	logit : bool
		True to add to the command log and filters, False to skip
	output:
	newexp : Experiment
		the sorted experiment
	"""
	params=locals()

	sortkeys=hs.getfieldvals(exp,field)
	if numeric:
		sortkeys=hs.tofloat(sortkeys)
	_,neworder=hs.isort(sortkeys)
	newexp=hs.reordersamples(exp,neworder)

	if not logit:
		return newexp
	hs.addcommand(newexp,"sortsamples",params=params,replaceparams={'exp':exp})
	newexp.filters.append('sorted samples by field %s' % field)
	return newexp
Beispiel #10
0
def filterbacteriafromfile(expdat,filename,exclude=False,subseq=False):
	"""
	filter bacteria from an experiment based on a file with sequences (one per line)
	input:
	expdat
	filename - name of the sequence file (1 per line, blank lines are ignored)
	exclude - remove bacteria from the file instead of keeping them
	subseq - the sequences in the file can be subsequences of the experiment sequences (different lengths). but slower.

	output:
	newexp - the filtered experiment
	"""
	params=locals()

	# 'r' (text mode) instead of 'rU': the 'U' flag was removed in python 3.11 and
	# universal newlines are the python 3 default. the with block also closes the file
	with open(filename,'r') as fl:
		stripped=(cline.strip() for cline in fl)
		# skip blank lines - an empty sequence would match everything in subseq mode
		seqs=[cseq for cseq in stripped if cseq]
	# pass the caller's subseq on (it used to be hard coded to False)
	newexp=hs.filterseqs(expdat,seqs,exclude=exclude,subseq=subseq)
	filt='Filter sequences from file '+filename
	if exclude:
		filt+=' (Exclude)'
	if subseq:
		filt+=' (subseq)'
	newexp.filters.append(filt)
	hs.addcommand(newexp,"filterbacteriafromfile",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #11
0
def filterannotations(expdat,annotation,cdb=None,exclude=False):
	"""
	filter keeping only bacteria which have annotation in their cooldb description
	input:
	expdat
	annotation - substring of the annotation (case insensitive)
	cdb - the database of cool sequences (from cooldb.load()) or None (default) to use the heatsequer loaded cdb
	exclude - False to keep matching bacteria, True to remove matching bacteria

	output:
	newexp - the filtered experiment
	"""
	params=locals()

	if cdb is None:
		cdb=hs.cdb
	keeplist=[]
	for idx,cseq in enumerate(expdat.seqs):
		seqinfo=hs.cooldb.getseqinfo(cdb,cseq)
		# a bacteria matches if any of its cooldb info entries contains the annotation
		matched=any(annotation.lower() in str(cinfo).lower() for cinfo in seqinfo)
		# keep matches normally, non-matches when excluding
		if matched!=bool(exclude):
			keeplist.append(idx)
	newexp=hs.reorderbacteria(expdat,keeplist)
	newexp.filters.append('Filter annotations %s' % annotation)
	hs.addcommand(newexp,"filterannotations",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d bacteria found' % len(keeplist))
	return newexp
Beispiel #12
0
def filterknownbact(expdat,cdb=None,exclude=False):
	"""
	filter keeping only bacteria which we know about in cooldb
	(a bacteria is known if cooldb.getseqinfo returns at least one entry for its sequence)
	input:
	expdat : Experiment
	cdb : cooldb
		the manual annotation database (from cooldb.loaddb) or None (default) to use the heatsequer loaded cdb
	exclude : bool
		True to throw away known bacteria, False to keep only them
	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	if cdb is None:
		cdb=hs.cdb
	known=[idx for idx,cseq in enumerate(expdat.seqs) if len(hs.cooldb.getseqinfo(cdb,cseq))>0]
	hs.Debug(2,'Found %d sequences known in cooldb' % len(known))
	if exclude:
		# build an ordered list of the unknown positions. the set difference used before
		# is not a valid index sequence and does not keep the bacteria order
		knownset=set(known)
		keep=[idx for idx in range(len(expdat.seqs)) if idx not in knownset]
	else:
		keep=known
	newexp=hs.reorderbacteria(expdat,keep)
	if exclude:
		newexp.filters.append('filter exclude cooldb known bacteria')
	else:
		newexp.filters.append('filter cooldb known bacteria')
		# only record this when keeping the known bacteria - it is false when excluding them
		newexp.filters.append('keep only sequences from cooldb')
	hs.Debug(6,'%d bacteria left' % len(newexp.sids))
	hs.addcommand(newexp,"filterknownbact",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #13
0
def normalizebyseqs(expdat,seqs,exclude=False,fixorig=True):
	"""
	normalize experiment by making the sum of frequencies in seqs constant in each sample
	input:
	expdat
	seqs - the sequences to use as the normalization factor (sum of the sequences)
	exclude - true to use all sequences except in seqs as the normalization factor, False to use seqs
	fixorig - True to modify the origreads and scalingfactor fields, false to leave them as they were

	output:
	newexp - the normalized experiment (samples where the normalizing sum is 0 use a sum of 1 instead)
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	spos=[expdat.seqdict[cseq] for cseq in seqs]
	if exclude:
		spos=np.setdiff1d(np.arange(len(expdat.seqs)),spos)
	ssum=np.sum(expdat.data[spos,:],axis=0)+0.0
	# avoid division by 0 for samples without any of the normalizing sequences
	ssum[ssum==0]=1
	# per sample factor relative to the mean over all samples
	frat=ssum/np.mean(ssum)
	for idx in range(len(expdat.samples)):
		newexp.data[:,idx]=newexp.data[:,idx]/frat[idx]
		if fixorig:
			newexp.origreads[idx]=newexp.origreads[idx]/frat[idx]
			# use the factor of this sample only (multiplying by the whole frat vector
			# tried to store an array in a single scalingfactor entry)
			newexp.scalingfactor[idx]=newexp.scalingfactor[idx]*frat[idx]
	filt='Normalize By Seqs '
	if len(spos)==1:
		filt+=newexp.tax[spos[0]]
	else:
		filt+=str(len(spos))
	if exclude:
		filt+=' Exclude'
	newexp.filters.append(filt)
	hs.addcommand(newexp,"normalizebyseqs",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #14
0
def filterid(expdat,sids,exclude=False):
	"""
	filter bacteria according to their (hashed) sequence ids
	input:
	expdat : Experiment
	sids : list of integers
		the list of (hashed) sequence ids (a single non-list value is treated as a list with 1 item)
	exclude : bool
		False to keep these bacteria, True to filter away
	output:
	newexp : Experiment
		the filtered experiment
	"""
	params=locals()

	if type(sids) is not list:
		sids=[sids]
	hs.Debug(1,'filter ids',sids)
	# positions of all bacteria whose id matches one of the requested ids
	keep=[idx for cid in sids for idx,tid in enumerate(expdat.sids) if tid==cid]
	if exclude:
		keep=set(range(len(expdat.sids))).difference(keep)
	# remove duplicate positions
	keep=list(set(keep))
	hs.Debug(1,'keep pos',keep)
	newexp=hs.reorderbacteria(expdat,keep)
	filtdesc='Filter %d ids' % len(sids)
	if exclude:
		filtdesc+=' (exclude)'
	newexp.filters.append(filtdesc)
	hs.addcommand(newexp,"filterid",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #15
0
def trimfieldnames(expdat,field,newfield,trimlen=6):
	"""
	store a trimmed version of the per sample values of field in newfield
	note: expdat itself is modified and returned (no copy is made)

	input:
	expdat: Experiment
	field : str
		name of the field to trim the values in
	newfield : str
		name of the field where to keep the trimmed values
	trimlen : int
		>0 : trim keeping first trimlen chars
		<0 : trim keeping last -trimlen chars
	output:
	expdat : Experiment
		with trimmed field values
	"""
	params=locals()

	for csamp in expdat.samples:
		fullval=expdat.smap[csamp][field]
		expdat.smap[csamp][newfield]=fullval[:trimlen] if trimlen>0 else fullval[trimlen:]
	expdat.fields.append(newfield)

	expdat.filters.append('Trim field names field %s trimlen %d' % (field,trimlen))
	hs.addcommand(expdat,"trimfieldnames",params=params,replaceparams={'expdat':expdat})
	return expdat
Beispiel #16
0
def sortbyvariance(expdat,field=False,value=False,exact=False,norm=False):
	"""
	sort bacteria by their variability (standard deviation over samples)
	the standard deviation is calculated on a subset of samples (field/value/exact) and then
	all the experiment is sorted according to it
	input:
	expdat : Experiment
	field : string
		name of the field to filter samples for the calculation or False for all samples
	value : string
		value of samples to use for the calculation
	exact : bool
		is the value exact or partial string
	norm : bool
		- False to sort by the standard deviation, True to sort by standard deviation/mean
	output:
	newexp : Experiment
		the experiment with bacteria sorted according to the subgroup variability
	"""
	params=locals()

	texp=hs.filtersamples(expdat,field,value,exact=exact) if field else copy.deepcopy(expdat)

	spread=np.std(texp.data,axis=1)
	if norm:
		spread=spread/np.mean(texp.data,axis=1)
	_,neworder=hs.isort(spread)

	newexp=hs.reorderbacteria(expdat,neworder)
	newexp.filters.append("sort by variance field=%s value=%s normalize=%s" % (field,value,norm))
	hs.addcommand(newexp,"sortbyvariance",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #17
0
def toorigreads(expdat,inplace=False):
	"""
	convert the number of reads to absolute using the scalingfactor field
	input:
	expdat
	inplace - True to modify expdat itself, False to create a new experiment

	output:
	newexp - data is multiplied by scalingfactor (using hs.multvec), rounded and converted to int
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	scaled=hs.multvec(newexp.data,newexp.scalingfactor)
	newexp.data=np.round(scaled).astype(int)
	# NOTE(review): scalingfactor becomes a scalar here while other code indexes it per sample - confirm
	newexp.scalingfactor=1

	newexp.filters.append("changed reads to origread value")
	hs.addcommand(newexp,"toorigreads",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #18
0
def sortbacteria(exp,inplace=False,logit=True):
	"""
	reorder the bacteria according to the sort order of their taxonomy strings

	input:
	exp : experiment
		the experiment to sort
	inplace : bool
		True to sort in place (replace current experiment), False to create a new experiment
	logit : bool
		True to add to command log, False to skip (if called from other heatsequer function)

	output:
		newexp : experiment
			The sorted experiment (by taxonomy name)
	"""
	params=locals()

	_,taxorder=hs.isort(exp.tax)
	newexp=hs.reorderbacteria(exp,taxorder,inplace=inplace)
	if not logit:
		return newexp
	newexp.filters.append('sorted bacteria by taxonomy')
	hs.addcommand(newexp,"sortbacteria",params=params,replaceparams={'exp':exp})
	return newexp
Beispiel #19
0
def samplemeanpervalue(expdat,field):
	"""
	create a new experiment with 1 sample per unique value in field, holding the mean of all samples with that value
	each new sample is based on the first sample with that value, with its reads replaced by the group mean

	input:
	expdat : Experiment
	field : string
		the field to use (i.e. 'ENV_MATTER')

	output:
	newexp : Experiment
		The new experiment with 1 sample per unique value of field
	"""
	params=locals()

	uvals=hs.getfieldvals(expdat,field,ounique=True)
	allvals=hs.getfieldvals(expdat,field,ounique=False)

	# presumably maps each value to the list of sample positions having it - TODO confirm against hs.listtodict
	valpos=hs.listtodict(allvals)
	firstsamps=[valpos[cval][0] for cval in uvals]
	newexp=hs.reordersamples(expdat,firstsamps)
	for idx,cval in enumerate(uvals):
		newexp.data[:,idx]=np.mean(expdat.data[:,valpos[cval]],axis=1)
	newexp.filters.append('samplemeanpervalue for field %s' % field)
	hs.addcommand(newexp,"samplemeanpervalue",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #20
0
def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
	"""
	sort bacteria according to their center of mass along the (optionally sorted) samples
	input:
	expdat
	field : string
		the name of the field to sort the samples by before the calculation or False to skip sorting
	numeric : bool
		True if the sort field is numeric (ignored if no sort field)
	uselog : bool
		True to log2 transform the data (reads <1 are set to 1) before mass center calculation
	output:
	newexp - the experiment with sorted bacteria
	"""
	params=locals()

	workexp=hs.sortsamples(expdat,field,numeric=numeric) if field else hs.copyexp(expdat)
	dat=workexp.data
	if uselog:
		dat[dat<1]=1
		dat=np.log2(dat)
	samppos=np.arange(len(workexp.samples))
	# center of mass per bacteria = sum(position*reads)/sum(reads)
	cm=[np.dot(dat[cseqind,:],samppos)/np.sum(dat[cseqind,:]) for cseqind in range(len(workexp.seqs))]
	sv,si=hs.isort(cm)
	# NOTE(review): the result is built from expdat, so its samples stay in the original (unsorted) order - confirm intended
	newexp=hs.reorderbacteria(expdat,si)
	newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field,uselog))
	hs.addcommand(newexp,"sortbycentermass",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #21
0
def fieldtobact(expdat,field,bactname='',meanreads=1000,cutoff=0):
	"""
	add a new bacteria whose reads are the (scaled) numeric values of a map file field
	(to facilitate numeric analysis)
	input:
	expdat : Experiment
	field : string
		name of the field to convert
	bactname : string
		name of the new bacteria (empty to use the field name)
	meanreads : int
		the mean number of reads for the new field bacteria
	cutoff : int
		the minimal value of the field per sample (samples below it get meanreads)

	output:
	newexp : Experiment
		with added bacteria with the field vals as reads
	"""
	params=locals()

	if len(bactname)==0:
		bactname=field
	vals=np.array(hs.tofloat(hs.getfieldvals(expdat,field)))
	okmask=vals>=cutoff
	lowmask=vals<cutoff
	# scale the valid values so their mean is meanreads
	valmean=np.mean(vals[okmask])
	vals[okmask]=(vals[okmask]/valmean)*meanreads
	vals[lowmask]=meanreads
	newexp=hs.copyexp(expdat)
	hs.insertbacteria(newexp,vals,bactname,bactname,logit=False)
	newexp.filters.append('add bacteria from map field %s' % field)
	hs.addcommand(newexp,"fieldtobact",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #22
0
def reloadmap(expdat,mapfilename):
	"""
	replace the mapping data (smap, fields, mapmd5) of a loaded experiment with the contents of a mapping file
	a debug message is shown for every experiment sample missing from the new map

	input:
	expdat : Experiment
	mapfilename : string
		Name of the mapping file to reload

	output:
	newexp : Experiment
		like expdat but with fields from new map file
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	mapsamples,newexp.smap,newexp.fields,newexp.mapmd5=loadmap(mapfilename)
	missing=[csamp for csamp in newexp.samples if csamp not in mapsamples]
	for csamp in missing:
		hs.Debug(7,'Sample %s not in new map!' % csamp)
	newexp.filters.append('reload map %s' % mapfilename)
	# NOTE(review): logged as "reloadmapfile" although the function is named reloadmap - confirm intended
	hs.addcommand(newexp,"reloadmapfile",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #23
0
def filtersamples(expdat,field,filtval,exact=True,exclude=False,numexpression=False,shownumoutput=True):
	"""
	filter samples in experiment according to value in field
	input:
	expdat : Experiment
	field : string
		name of the field to filter by
	filtval : string or list of strings
		the string to filter (if a list of strings, filter if any in the list)
	exact : bool
		True for exact match, False for substring
	exclude : bool
		False to keep only matching samples, True to exclude matching samples
	numexpression : bool
		True if filtval is a python expression, False if just a value.
		For an expression the sample value is put at the beginning of it (i.e. '<=5'). Samples with an empty value never match
	shownumoutput : bool
		True (default) to show number of samples remaining, False to not show
	output:
	newexp : Experiment
		the experiment containing only the samples passing the filter
	"""
	params=locals()
	if not isinstance(filtval,list):
		filtval=[filtval]

	keep=[]
	for cidx,csamp in enumerate(expdat.samples):
		keepit=False
		for cfilt in filtval:
			if numexpression:
				cval=expdat.smap[csamp][field]
				if len(cval)==0:
					continue
				# NOTE(review): eval() executes the map file value joined with the filter string
				# as python code - only safe with trusted mapping files and filter expressions
				if eval(cval+cfilt):
					keepit=True
			elif exact:
				if expdat.smap[csamp][field]==cfilt:
					keepit=True
			else:
				if cfilt in expdat.smap[csamp][field]:
					keepit=True
		# if exclude reverse the decision
		if exclude:
			keepit=not keepit
		if keepit:
			keep.append(cidx)
	newexp=hs.reordersamples(expdat,keep)
	# describe all the filter values (not only the last one tested in the loop).
	# for a single value the text is the same as before
	filtdesc=','.join(str(cfilt) for cfilt in filtval)
	fstr="filter data %s in %s " % (filtdesc,field)
	if exact:
		fstr=fstr+"(exact)"
	else:
		fstr=fstr+"(substr)"
	if exclude:
		fstr+=" (exclude)"
	newexp.filters.append(fstr)
	hs.addcommand(newexp,"filtersamples",params=params,replaceparams={'expdat':expdat})
	if shownumoutput:
		hs.Debug(6,'%d Samples left' % len(newexp.samples))
	else:
		hs.Debug(1,'%d Samples left' % len(newexp.samples))
	return newexp
Beispiel #24
0
def updateorigreads(expdat,logit=True):
	"""
	copy the origreads value of every sample into the 'origReads' entry of its mapping data
	note: expdat itself is modified and returned (no copy is made)
	input:
	expdat : Experiment
	logit : bool
		True to add to the filters and command log, False to skip
	output:
	expdat : Experiment
		the same experiment with the updated 'origReads' map values
	"""
	params=locals()

	for pos,sampname in enumerate(expdat.samples):
		expdat.smap[sampname]['origReads']=expdat.origreads[pos]
	if not logit:
		return expdat
	expdat.filters.append("Update orig reads")
	hs.addcommand(expdat,"updateorigreads",params=params,replaceparams={})
	return expdat
Beispiel #25
0
def filterfieldwave(expdat,field,val1,val2=False,mineffect=1,method='mean',uselog=True):
	"""
	find all sequences which show an effect size of at least mineffect between val1 and val2 samples in field
	no statistical significance testing is performed
	the data of expdat is not modified (the transforms are done on a copy)

	input:
	expdat : Experiment
	field : string
		name of field to use for group separation
	val1 : string
		value in field for group1
	val2 : string
		value in field for group2 or False for all the other samples except val1
	mineffect : float
		min absolute difference between the group means per OTU in order to keep
	method: string
		'ranksum' to rank transform each sequence (over all samples) before comparing the group means
		any other value (default 'mean') to compare the means of the (optionally log transformed) reads
	uselog : bool
		True to log2 transform the data (reads <1 are set to 1)

	output:
	newexp : Experiment
		only with sequences showing a mineffect difference, sorted by the difference (group1-group2)
	"""
	params=locals()

	numseqs=len(expdat.seqs)
	numsamples=len(expdat.samples)
	# work on a float copy: the clipping and rank transform below are done in place and
	# must not overwrite the reads of the input experiment (float so tied ranks like 1.5 are kept)
	dat=np.array(expdat.data,dtype=float)
	if uselog:
		dat[dat<1]=1
		dat=np.log2(dat)
	if method=='ranksum':
		for idx in range(numseqs):
			dat[idx,:]=stats.rankdata(dat[idx,:])

	pos1=hs.findsamples(expdat,field,val1)
	if val2:
		pos2=hs.findsamples(expdat,field,val2)
	else:
		pos2=np.setdiff1d(np.arange(numsamples),pos1,assume_unique=True)

	outpos=[]
	odif=[]
	for idx in range(numseqs):
		cdif=np.mean(dat[idx,pos1])-np.mean(dat[idx,pos2])
		if abs(cdif)>=mineffect:
			outpos.append(idx)
			odif.append(cdif)

	# order the kept sequences by their effect
	si=np.argsort(odif)
	outpos=hs.reorder(outpos,si)
	newexp=hs.reorderbacteria(expdat,outpos)
	newexp.filters.append('filterfieldwave field %s val1 %s val2 %s' % (field,val1,val2))
	hs.addcommand(newexp,"filterfieldwave",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #26
0
def addsubtrees(expdat,tree,inplace=False):
	"""
	add an otu for every subtree containing at least 2 bacteria of the experiment.
	its frequency is the sum of these bacteria, its sequence name is the list of their positions ('pos1,pos2,...,')
	and its taxonomy is the common start of their taxonomies
	input:
	expdat - the experiment
	tree - the tree for the experiment (must supply subsets())
	inplace - if true, replace current experiment

	output:
	newexp - the new experiment with the added subtree otus
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	for csubtree in tree.subsets():
		poslist=[]
		newtax=""
		newfreq=np.zeros([1,len(newexp.samples)])
		for cbact in csubtree:
			if cbact not in newexp.seqdict:
				hs.Debug(4,'sequence not in seqdict',cbact)
				continue
			cpos=newexp.seqdict[cbact]
			poslist.append(cpos)
			newfreq+=newexp.data[cpos,:]
			if newtax=='':
				newtax=newexp.tax[cpos]
			else:
				newtax=hs.common_start(newtax,newexp.tax[cpos])
		# add only if we have 2 bacteria or more
		if len(poslist)<=1:
			continue
		newname=''.join('%d,' % cpos for cpos in poslist)
		# skip a subtree that was already added
		if newname in newexp.seqdict:
			continue
		newexp,newpos=insertbacteria(newexp,freqs=newfreq,seq=newname,tax=newtax,logit=False)

	newexp.filters.append("Add subtrees")
	hs.addcommand(newexp,"addsubtrees",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #27
0
def loadrdptax(expdat,rdpfilename,fastaname=False,threshold=60):
	"""
	load rdp taxonomy (the output of download allrank in the rdp classifier website) and store it in the experiment
	note: expdat itself is modified and returned (no copy is made)
	input:
	expdat - the experiment for whose sequences the taxonomy was assigned
	rdpfilename - name of the saved allrank rdp assignment
	fastaname - name of fasta file used for rdp assignment (if it was not from saveseqsforrdp) or False if sequences are in the header of the fasta
	threshold - the assignment probability threshold (in percent) under which to not include the assignment (for each level)
	output:
	expdat - the experiment with the updated tax entries
	"""
	params=locals()

	# fasta header -> sequence (only needed when the rdp file contains headers instead of sequences)
	hdict={}
	if fastaname:
		seqs,headers=hs.readfastaseqs(fastaname)
		for idx,chead in enumerate(headers):
			hdict[chead]=seqs[idx]

	# the with block closes the file also if a line fails to parse
	with open(rdpfilename,'r') as fl:
		for cline in fl:
			cdat=cline.rstrip().split(';')
			# skip header lines
			if len(cdat)<2:
				continue
			# check if sequence in experiment
			cseq=cdat[0]
			if fastaname:
				if cseq in hdict:
					cseq=hdict[cseq]
				else:
					hs.Debug(6,'sequence %s not found in fasta file' % cseq)
			if cseq not in expdat.seqdict:
				hs.Debug(6,'sequence %s not found in experiment' % cseq)
				continue
			cpos=expdat.seqdict[cseq]
			ctax=''
			# from position 2 the fields are read as (name,probability) pairs.
			# stop before a trailing name without probability instead of raising IndexError
			for idx in range(2,len(cdat)-1,2):
				cp=cdat[idx+1].rstrip('%')
				# use the threshold parameter (it was hard coded to 60 before)
				if float(cp)<threshold:
					break
				ctax+=';'
				ctax+=cdat[idx]
			expdat.tax[cpos]=ctax
	expdat.filters.append("loaded rdp taxonomy from file %s" % rdpfilename)
	hs.addcommand(expdat,"loadrdptax",params=params,replaceparams={'expdat':expdat})
	return expdat
Beispiel #28
0
def filtermapfields(expdat,fields=['#SampleID'],keep=True,inplace=False):
	"""
	filter fields from the experiment mapping data
	the '#SampleID' field is always kept

	input:
	expdat : Experiment
	fields : list of str
		the list of the fields to keep/remove
	keep : bool (optional)
		True (default) to keep only the fields specified
		False to remove the fields specified
	inplace : bool (optional)
		False (default) to create new experiment
		True to replace in current experiment

	output:
	newexp : Experiment
		with only the fields requested
	"""
	params=locals()

	wanted=set(expdat.fields)
	if keep:
		wanted=wanted.intersection(set(fields))
	else:
		wanted=wanted.difference(set(fields))
	wanted.add('#SampleID')

	# keep the fields in their original order (converting the set to a list gives an arbitrary order).
	# '#SampleID' is added at the end if it was not in expdat.fields
	newfields=[]
	for cfield in list(expdat.fields)+['#SampleID']:
		if cfield in wanted and cfield not in newfields:
			newfields.append(cfield)

	newsmap={}
	for csamp in expdat.samples:
		newsmap[csamp]={cfield:expdat.smap[csamp][cfield] for cfield in newfields}

	if inplace:
		newexp=expdat
	else:
		newexp=hs.copyexp(expdat)
	newexp.fields=newfields
	newexp.smap=newsmap

	# log on the returned experiment (it used to be logged on expdat, so with inplace=False
	# the input got the filter entry and the result did not)
	newexp.filters.append('filter map fields %s (keep=%s)' % (fields,keep))
	hs.addcommand(newexp,"filtermapfields",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #29
0
def clearexp(expdat):
	"""
	clear experiment from missing samples and bacteria
	remove samples with <1 reads and bacteria with total <1 reads
	input:
	expdat : Experiment
	output:
	newexp : Experiment
		the new experiment without <1 reads samples or bacteria
	"""
	params=locals()

	newexp=filterorigreads(expdat,1)
	# filter the bacteria of the sample-filtered experiment (filtering expdat again
	# threw away the result of the sample filtering)
	newexp=filterminreads(newexp,1)
	newexp.filters.append('clear nonpresent bacteria and samples')
	hs.Debug(6,'%d bacteria left' % len(newexp.sids))
	hs.addcommand(newexp,"clearexp",params=params,replaceparams={'expdat':expdat})
	return newexp
Beispiel #30
0
def joinfields(expdat,field1,field2,newfield):
	"""
	create a new field containing, for each sample, the values of 2 fields joined by ';'
	note: expdat itself is modified and returned (no copy is made)
	input:
	expdat : Experiment
	field1,field2 : string
		name of the 2 fields to join
	newfield : string
		name of new field to add
	output:
	expdat : Experiment
		the same experiment with the added field
	"""
	params=locals()

	for csamp in expdat.samples:
		sampmap=expdat.smap[csamp]
		joined=sampmap[field1]+';'+sampmap[field2]
		sampmap[newfield]=joined
	expdat.fields.append(newfield)
	expdat.filters.append("join fields %s, %s to new field %s" % (field1,field2,newfield))
	hs.addcommand(expdat,"joinfields",params=params,replaceparams={'expdat':expdat})
	return expdat