Esempio n. 1
0
def findmislabels(expdat,field,distmetric='bc'):
	"""
	find mislabelled samples according to field
	For each sample, the mean distance to the samples of every field value group
	(not including the sample itself) is calculated, and the resulting
	samples x groups matrix is plotted as a heatmap in a new figure.
	A sample which is closer to another group than to its own is a mislabelling candidate.

	input:
	expdat : Experiment
	field : string
		name of the field to examine (e.g. subjectid)
	distmetric : string
		the distance metric to use (passed to hs.calcdist)

	output:
	None (the result is only plotted)
	"""

	expdat=hs.sortsamples(expdat,field)
	fvals=hs.getfieldvals(expdat,field)

	# unique field values in order of first appearance (instead of the arbitrary set order),
	# so the heatmap columns are reproducible and follow the order of the sorted samples
	ufvals=[]
	seenvals=set()
	for cval in fvals:
		if cval not in seenvals:
			seenvals.add(cval)
			ufvals.append(cval)

	# row labels: sample name and its field value
	onames=[csamp+';'+cval for csamp,cval in zip(expdat.samples,fvals)]

	omat=np.zeros([len(fvals),len(ufvals)])
	for groupidx,groupval in enumerate(ufvals):
		cexp=hs.filtersamples(expdat,field,groupval,exact=True)
		for aidx,aval in enumerate(expdat.samples):
			cdist=[]
			for gidx,gval in enumerate(cexp.samples):
				# don't measure distance to ourselves
				if gval==aval:
					continue
				cdist.append(hs.calcdist(cexp.data[:,gidx],expdat.data[:,aidx],distmetric=distmetric))
			if cdist:
				omat[aidx,groupidx]=np.mean(cdist)
			else:
				# the group contains only the sample itself - no distance can be measured
				omat[aidx,groupidx]=np.nan

	figure()
	iax=imshow(omat,interpolation='nearest',aspect='auto')
	# use the axes attribute (the get_axes() method was removed from matplotlib artists)
	ax=iax.axes
	ax.set_xticks(range(len(ufvals)))
	ax.set_xticklabels(ufvals,rotation=90)
	ax.set_yticks(range(len(onames)))
	ax.set_yticklabels(onames)
Esempio n. 2
0
def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
	"""
	order the bacteria of an experiment along a 1d gradient according to their center of mass
	The center of mass of each bacteria is the abundance-weighted mean sample position
	(positions 0..numsamples-1, after optionally sorting the samples by field)

	input:
	expdat : Experiment
	field : string
		name of the field to sort the samples by before the calculation, or False to keep the sample order
	numeric : bool
		True if the sort field is numeric (only relevant when field is supplied)
	uselog : bool
		True to set values <1 to 1 and log2 transform the data before the center of mass calculation

	output:
	newexp : Experiment
		expdat with the bacteria reordered by center of mass
		(the reordering is applied to the input expdat, not to the sample-sorted copy)
	"""
	params=locals()

	# the center of mass is calculated on a (possibly sample-sorted) copy
	if not field:
		workexp=hs.copyexp(expdat)
	else:
		workexp=hs.sortsamples(expdat,field,numeric=numeric)

	vals=workexp.data
	if uselog:
		vals[vals<1]=1
		vals=np.log2(vals)

	positions=np.arange(len(workexp.samples))
	# weighted mean position per bacteria
	centers=[np.dot(vals[seqidx,:],positions)/np.sum(vals[seqidx,:]) for seqidx in range(len(workexp.seqs))]
	sortedcenters,order=hs.isort(centers)

	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field,uselog))
	hs.addcommand(newexp,"sortbycentermass",params=params,replaceparams={'expdat':expdat})
	return newexp
Esempio n. 3
0
	def sortsamples(self):
		"""
		sort the samples of the experiment selected in the main list
		Opens a SortSamplesWindow dialog for the selected experiment. If the dialog is accepted,
		the experiment is sorted by the chosen field and either added as a new experiment
		(using the new name from the dialog) or used to replace the existing one (overwrite checked).
		Exactly one experiment must be selected, otherwise a message is printed and nothing is done.
		"""
		selected=self.bMainList.selectedItems()
		if len(selected)!=1:
			print("Need 1 item")
			return

		# exactly one item is selected at this point
		expname=str(selected[0].text())
		srcexp=self.explist[expname]
		dialog=SortSamplesWindow(srcexp)
		res=dialog.exec_()
		if res==QtGui.QDialog.Accepted:
			field=str(dialog.cField.currentText())
			newname=str(dialog.tNewName.text())
			# a check state of 0 means the checkbox is not checked
			numeric=dialog.cNumeric.checkState()!=0
			overwrite=dialog.cOverwrite.checkState()
			sortedexp=hs.sortsamples(srcexp,field=field,numeric=numeric)
			if overwrite==0:
				sortedexp.studyname=newname
				self.addexp(sortedexp)
			else:
				self.replaceexp(sortedexp)
Esempio n. 4
0
def filterwave(expdat,field=False,numeric=True,minfold=2,minlen=3,step=1,direction='up',posloc='start'):
	"""
	filter bacteria, keeping only ones that show a consecutive region of samples with higher/lower mean than other samples
	Done by scanning all windowlen/startpos options for each bacteria and comparing the mean inside
	the window to the mean of all samples outside the window.
	The comparison is done on log2 transformed data (values <1 are treated as 1), so minfold is
	the minimal difference of the log2 means.

	input:
	expdat : Experiment
	field : string
		The field to sort by or False to skip sorting
	numeric : bool
		For the sorting according to field (does not matter if field is False)
	minfold : float
		The minimal change (difference of log2 means) for the window compared to the rest in order to keep
	minlen : int
		The minimal window len for over/under expression testing
	step : int
		The skip between tested window lengths (to make it faster use a larger skip)
	direction : string
		'both' - test both over and under expression in the window
		'up' - only overexpressed
		'down' - only underexpressed
	posloc : string
		The position used for ordering the kept bacteria:
		'start' (or 'maxstart') - the beginning of the maximal wave
		'mid' (or 'maxmid') - the middle of the maximal wave
		'gstart' - the beginning of the first window with >=minfold change

	output:
	newexp : Experiment
		The filtered experiment, sorted according to window start samples position
		(or False if posloc is not supported)
	"""
	params=locals()

	# validate posloc before doing any work
	startnames=('start','maxstart')
	midnames=('mid','maxmid')
	if posloc not in startnames+midnames+('gstart',):
		# NOTE(review): debug level 9 chosen to stand out from the level 6 info messages - confirm
		hs.Debug(9,'posloc not supported %s' % posloc)
		return False

	# sort if needed
	if field:
		newexp=hs.sortsamples(expdat,field,numeric=numeric)
	else:
		newexp=hs.copyexp(expdat)

	# work on a transformed copy so the abundances stored in the returned experiment are not modified
	dat=np.log2(np.maximum(newexp.data,1))
	numsamples=len(newexp.samples)
	numbact=len(newexp.seqs)
	# per bacteria sum over all samples - used to get the outside-window mean without re-indexing
	totsum=np.sum(dat,axis=1)
	maxdiff=np.zeros([numbact])
	maxpos=np.zeros([numbact])-1
	maxlen=np.zeros([numbact])
	for startpos in range(numsamples-minlen+1):
		for cwin in range(minlen,numsamples-startpos+1,step):
			numout=numsamples-cwin
			if numout<1:
				# the window covers all samples so there is nothing to compare to
				continue
			insum=np.sum(dat[:,startpos:startpos+cwin],axis=1)
			meanin=insum/cwin
			# mean of the samples before and after the window
			meanout=(totsum-insum)/numout
			cdiff=meanin-meanout
			if direction=='both':
				cdiff=np.abs(cdiff)
			elif direction=='down':
				cdiff=-cdiff
			# bacteria for which this window is the best one so far
			better=cdiff>maxdiff
			if posloc=='gstart':
				# only the first window passing the threshold sets the position
				usepos=np.logical_and(cdiff>=minfold,maxpos==-1)
				maxpos[usepos]=startpos
			elif posloc in startnames:
				maxpos[better]=startpos
			else:
				maxpos[better]=startpos+cwin//2
			maxlen[better]=cwin
			maxdiff=np.maximum(maxdiff,cdiff)

	keep=np.where(maxdiff>=minfold)[0]
	keeppos=maxpos[keep]
	# stable sort so bacteria with the same position keep their relative order
	si=np.argsort(keeppos,kind='mergesort')
	keep=keep[si]
	for ci in keep:
		hs.Debug(6,'bacteria %s startpos %d len %d diff %f' % (newexp.tax[ci],maxpos[ci],maxlen[ci],maxdiff[ci]))
	numbefore=len(expdat.seqs)
	newexp=hs.reorderbacteria(newexp,keep)
	newexp.filters.append('Filter wave field=%s minlen=%d' % (field,minlen))
	hs.addcommand(newexp,"filterwave",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d bacteria before filtering, %d after' % (numbefore,len(newexp.seqs)))
	return newexp