Esempio n. 1
0
def showleakage(expdat,seq,wwpp=['1','2','3','4','5','6','7','8']):
	"""
	plot the per-well frequency of a single sequence on each primer plate
	one subplot is drawn per plate, showing an 8x12 (rows x columns) heatmap of the
	log2 frequency of the sequence in each well. wells holding an NTC sample are marked with 'x'
	input:
	expdat : Experiment
	seq : string
		the sequence to plot
	wwpp : list of string
		the values of the 'primerplate_int' field (one per plate) to plot
	"""
	plates=list(wwpp)
	# keep the original 3x3 layout, adding rows only when more than 9 plates are requested
	ncols=3
	nrows=max(3,(len(plates)+ncols-1)//ncols)

	figure()
	newexp=hs.filterseqs(expdat,[seq])
	for idx,cplate in enumerate(plates):
		print(cplate)
		cexp=hs.filtersamples(newexp,'primerplate_int',cplate,exact=True)
		print(len(cexp.samples))
		subplot(nrows,ncols,idx+1)
		title(cplate)
		# wells without a sample stay nan (np.NAN was removed in numpy 2.0)
		cmat=np.full([8,12],np.nan)
		ntcwells=[]
		for idx2,csamp in enumerate(cexp.samples):
			cinfo=cexp.smap[csamp]
			# the row letter ('A','B',...) maps to 0,1,... and the 1-based column to 0-based
			crow=ord(cinfo['Row'].lower())-ord('a')
			ccol=int(cinfo['column_int'])-1
			# row 0 of the data is used (the experiment was filtered to the single sequence seq)
			cval=cexp.data[0,idx2]
			# non-positive frequencies are drawn at the bottom of the color scale
			cmat[crow,ccol]=np.log2(cval) if cval>0 else -5
			if int(cinfo['NTC_bool']):
				ntcwells.append((ccol,crow))
		imshow(cmat,interpolation='nearest',aspect='auto',clim=[-5,10],cmap=plt.get_cmap("coolwarm"))
		for ccol,crow in ntcwells:
			text(ccol,crow,'x')
Esempio n. 2
0
def findmislabels(expdat,field,distmetric='bc'):
	"""
	find mislabelled samples according to field
	plots a heatmap where each row is a sample and each column is a value of field.
	each cell is the mean distance of the sample to the samples having this field value
	(not including the sample itself)
	input:
	expdat : Experiment
	field : string
		name of the field to examine (i.e. subjectid)
	distmetric : string
		the distance metric to use (see calcdist)
	"""

	expdat=hs.sortsamples(expdat,field)
	fvals=hs.getfieldvals(expdat,field)
	# sorted so the column order is the same on every run
	ufvals=sorted(set(fvals))
	onames=[csamp+';'+fvals[idx] for idx,csamp in enumerate(expdat.samples)]
	omat=np.zeros([len(fvals),len(ufvals)])
	for groupidx,groupval in enumerate(ufvals):
		cexp=hs.filtersamples(expdat,field,groupval,exact=True)
		for aidx,aval in enumerate(expdat.samples):
			cdist=[]
			for gidx,gval in enumerate(cexp.samples):
				# don't measure distance to ourselves
				if gval==aval:
					continue
				cdist.append(hs.calcdist(cexp.data[:,gidx],expdat.data[:,aidx],distmetric=distmetric))
			# a sample which is the only member of its group has nothing to compare to
			omat[aidx,groupidx]=np.mean(cdist) if cdist else np.nan
	figure()
	iax=imshow(omat,interpolation='nearest',aspect='auto')
	# Artist.get_axes() was removed from matplotlib, the axes attribute replaces it
	ax=iax.axes
	ax.set_xticks(range(len(ufvals)))
	ax.set_xticklabels(ufvals,rotation=90)
	ax.set_yticks(range(len(onames)))
	ax.set_yticklabels(onames)
Esempio n. 3
0
def sortbyvariance(expdat,field=False,value=False,exact=False,norm=False):
	"""
	reorder the bacteria of an experiment by how much they vary between samples
	the variability is measured only on a subset of the samples (selected by field/value/exact)
	but the resulting order is applied to all the samples of the experiment
	input:
	expdat : Experiment
	field : string
		name of the field used to select the samples for measuring, or False to use all samples
	value : string
		the value (in field) of the samples to measure on
	exact : bool
		passed to filtersamples - is the value an exact match or a partial string
	norm : bool
		False to use the standard deviation, True to use standard deviation divided by the mean
	output:
	newexp : Experiment
		the experiment with the bacteria reordered according to the measure
	"""
	# must come first so only the call parameters are captured
	params=locals()

	if not field:
		subset=copy.deepcopy(expdat)
	else:
		subset=hs.filtersamples(expdat,field,value,exact=exact)

	# per bacteria (row) spread over the selected samples
	spread=np.std(subset.data,axis=1)
	if norm:
		rowmeans=np.mean(subset.data,axis=1)
		spread=spread/rowmeans
	_,order=hs.isort(spread)

	newexp=hs.reorderbacteria(expdat,order)
	description="sort by variance field=%s value=%s normalize=%s" % (field,value,norm)
	newexp.filters.append(description)
	hs.addcommand(newexp,"sortbyvariance",params=params,replaceparams={'expdat':expdat})
	return newexp
Esempio n. 4
0
def sortbygroupdiff(expdat,field,val1,val2):
	"""
	reorder the bacteria by the normalized difference of their mean log2 frequency between 2 sample groups
	input:
	expdat : Experiment
	field : string
		name of the field defining the 2 groups
	val1,val2 : string
		the values (in field) of the first and second group (matched exactly)

	output:
	newexp : Experiment
		the experiment (all samples) with the bacteria reordered
	"""
	# must come first so only the call parameters are captured
	params=locals()

	def meanlog(value):
		# mean (per bacteria) of log2(freq+2) over the samples with field==value
		groupexp=hs.filtersamples(expdat,field,value,exact=True)
		return np.mean(np.log2(groupexp.data+2),axis=1)

	mean1=meanlog(val1)
	mean2=meanlog(val2)
	# the constant 20 in the denominator is kept as in the original score
	score=(mean1-mean2)/(mean1+mean2+20)
	_,order=hs.isort(score)

	newexp=hs.reorderbacteria(expdat,order)
	description="sort by group difference field=%s val1=%s val2=%s" % (field,val1,val2)
	newexp.filters.append(description)
	hs.addcommand(newexp,"sortbygroupdiff",params=params,replaceparams={'expdat':expdat})
	return newexp
Esempio n. 5
0
def filterwinperid(expdat,idfield,field,val1,val2,mineffect=1):
	"""
	do filterfieldwave on each individual (based on idfield) and join the resulting bacteria
	input:
	expdat : Experiment
	idfield : string
		name of the field holding the individual id. filterfieldwave is run separately on the samples of each unique value
	field,val1,val2,mineffect :
		passed as is to filterfieldwave for each individual

	output:
	newexp : Experiment
		expdat (all samples) keeping only the bacteria returned by filterfieldwave for at least one individual
	"""
	params=locals()

	iseqs=[]
	seen=set()
	uids=hs.getfieldvals(expdat,idfield,ounique=True)
	for cid in uids:
		# NOTE(review): exact is not specified here, so ids may be matched using the filtersamples default - confirm
		cexp=hs.filtersamples(expdat,idfield,cid)
		texp=hs.filterfieldwave(cexp,field,val1,val2,mineffect=mineffect)
		# collect the union in first-seen order so the result does not depend on set ordering
		for cseq in texp.seqs:
			if cseq not in seen:
				seen.add(cseq)
				iseqs.append(cseq)
	newexp=hs.filterseqs(expdat,iseqs)
	# record the operation the same way the sort functions in this file do
	newexp.filters.append("filterwinperid idfield=%s field=%s val1=%s val2=%s mineffect=%s" % (idfield,field,val1,val2,mineffect))
	hs.addcommand(newexp,"filterwinperid",params=params,replaceparams={'expdat':expdat})
	return newexp
Esempio n. 6
0
def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
	"""
	keep the bacteria with a significant excess of positive or negative samples and order them by their mean sign
	(nans are ignored)
	input:
	expdat : Experiment
	field,value,exclude,exact : passed to filtersamples to select the samples used for the test
		or field=False to use all samples (default)
	maxfval : float
		bacteria are kept only if the value returned by hs.fdr for them is <= maxfval

	output:
	newexp : Experiment
		only the bacteria passing the threshold, ordered by increasing mean sign
	"""
	# must come first so only the call parameters are captured
	params=locals()

	if not field:
		texp=hs.copyexp(expdat)
	else:
		texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude)

	# only the sign of each entry is used from here on
	texp.data=np.sign(texp.data)
	poscount=np.nansum(texp.data>0,axis=1)
	negcount=np.nansum(texp.data<0,axis=1)

	# binomial test (p=0.5) per bacteria, taking the smaller of the 2 tails
	pval=np.ones(len(poscount))
	for row in range(len(pval)):
		npos=poscount[row]
		nneg=negcount[row]
		if npos==0 and nneg==0:
			# no non-zero samples - leave the p-value at 1
			continue
		total=npos+nneg
		tailpos=stats.binom.cdf(npos,total,0.5)
		tailneg=stats.binom.cdf(nneg,total,0.5)
		pval[row]=np.min([tailpos,tailneg])

	meansign=np.nanmean(texp.data,axis=1)

	fval=hs.fdr(pval)
	passed=np.nonzero(np.array(fval)<=maxfval)[0]
	newexp=hs.reorderbacteria(expdat,passed)

	# order the surviving bacteria by their mean sign
	order=np.argsort(meansign[passed])
	newexp=hs.reorderbacteria(newexp,order)
	newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
	hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
	return newexp
Esempio n. 7
0
def sortbyfreq(expdat,field=False,value=False,exact=False,exclude=False,logscale=True,useabs=False):
	"""
	reorder the bacteria of an experiment by their mean frequency
	the mean is measured only on a subset of the samples (selected by field/value/exact/exclude)
	but the resulting order is applied to all the samples of the experiment
	input:
	expdat : Experiment
	field : string
		name of the field used to select the samples for measuring, or False to use all samples
	value : string
		the value (in field) of the samples to measure on
	exact : bool
		passed to filtersamples - is the value an exact match or a partial string
	exclude : bool
		True to measure on all samples except the field/value ones, False (default) to measure only on the field/value samples
	logscale : bool
		True (default) to set frequencies below 2 to 2 and take log2 before calculating the mean, False to use the original values
	useabs : bool
		True to use the mean of the absolute values, False (default) to use the mean of the values

	output:
	newexp : Experiment
		the experiment with the bacteria reordered according to the subgroup mean frequency
	"""
	# must come first so only the call parameters are captured
	params=locals()

	if not field:
		subset=copy.deepcopy(expdat)
	else:
		subset=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude)

	if logscale:
		# raise everything below 2 to 2 (in place) so the log2 is at least 1
		toolow=subset.data<2
		subset.data[toolow]=2
		subset.data=np.log2(subset.data)

	source=np.abs(subset.data) if useabs else subset.data
	meanfreq=np.mean(source,axis=1)
	_,order=hs.isort(meanfreq)

	newexp=hs.reorderbacteria(expdat,order)
	description="sort by freq field=%s value=%s" % (field,value)
	newexp.filters.append(description)
	hs.addcommand(newexp,"sortbyfreq",params=params,replaceparams={'expdat':expdat})
	return newexp
Esempio n. 8
0
def getgroupgroupdist(expdat,field,distmat,dsamp,uvals=False,subfield='host_subject_id',vmin=0,vmax=1):
	"""
	calculate the distance matrix based on groups of samples according to field but calculate separately for each individual and then combine
	using a distance matrix and mapping
	for the Amina skin cosmetics study

	input:
	expdat : Experiment
	field : string
		name of the field to group by
	distmat : numpy 2d array
		the distance matrix (from calcdistmat or loaddistmat)
	dsamp : dict
		the mapping of each sample id to the distance matrix position (from calcdistmat or loaddistmat)
	uvals : list of string
		False (default) or empty to use all values of field (in sorted order), or a list of values to use only them
	subfield : str
		name of the subfield so all distances are calculated separately for each subfield value (i.e. 'host_subject_id')
	vmin,vmax : float
		passed to plotdistheatmap

	output:
	omat : numpy 2d array
		the group x group distance matrix (len(uvals) x len(uvals)) averaged over all subfield values
		(all nan if there are no subfield values)
	"""
	vals=hs.getfieldvals(expdat,field)
	if not uvals:
		# sorted so the row/column order of the returned matrix is the same on every run
		uvals=sorted(set(vals))
	svals=hs.getfieldvals(expdat,subfield,ounique=True)
	omat=np.zeros([len(uvals),len(uvals)])
	numok=0
	for cval in svals:
		newexp=hs.filtersamples(expdat,subfield,cval)
		gdist,uvals=hs.getgroupdist(newexp,field,distmat,dsamp,plotit=False,uvals=uvals)
		# group pairs with no distance for this individual contribute 0 to the sum.
		# the nan test that followed this line in the original could never trigger
		# (nans were already zeroed) so it was removed
		# NOTE(review): such pairs are still counted in numok, pulling their average towards 0 - confirm this is intended
		gdist[np.isnan(gdist)]=0
		omat=omat+gdist
		numok+=1
	if numok>0:
		omat=omat/numok
	else:
		# no individuals - same nan result as the 0/0 division, without the numpy warning
		omat[:]=np.nan
	plotdistheatmap(omat,uvals,vmin=vmin,vmax=vmax)
	return omat
Esempio n. 9
0
	def filtersamples(self):
		"""
		filter the samples of the experiment selected in the main list
		opens a FilterSamplesWindow dialog for the selected experiment and, if it is accepted,
		runs hs.filtersamples with the field/value/exclude/exact chosen in the dialog.
		the result either replaces the experiment (overwrite checked) or is added
		as a new experiment named by the dialog's new name box
		exactly one experiment must be selected, otherwise a message is printed and nothing is done
		"""
		items=self.bMainList.selectedItems()
		if len(items)!=1:
			print("Need 1 item")
			return
		cname=str(items[0].text())
		cexp=self.explist[cname]
		filtersampleswin = FilterSamplesWindow(cexp)
		res=filtersampleswin.exec_()
		if res!=QtGui.QDialog.Accepted:
			return
		field=str(filtersampleswin.cField.currentText())
		value=str(filtersampleswin.tValue.text())
		newname=str(filtersampleswin.tNewName.text())
		# checkState() gives a Qt check-state value where 0 means not checked.
		# convert to real booleans so hs.filtersamples gets True/False and not the raw state value
		overwrite=filtersampleswin.cOverwrite.checkState()!=0
		exclude=filtersampleswin.cExclude.checkState()!=0
		exact=filtersampleswin.cExact.checkState()!=0
		newexp=hs.filtersamples(cexp,field,value,exclude=exclude,exact=exact)
		if overwrite:
			self.replaceexp(newexp)
		else:
			newexp.studyname=newname
			self.addexp(newexp)