Exemple #1
0
def sortbycentermass(expdat,field=False,numeric=True,uselog=True):
	"""
	order the bacteria of an experiment by their center of mass along the sample axis

	The center of mass of a bacteria is the read-weighted mean of the sample positions
	(0 .. number of samples-1). The positions are taken after optionally sorting the samples by field.
	Note the returned experiment is built from expdat, so its samples keep the order they have in expdat.

	input:
	expdat : Experiment
	field : string or False
		name of the field used to order the samples before the center of mass calculation,
		or False (default) to use the current sample order
	numeric : bool
		True (default) if the sort field is numeric (passed to hs.sortsamples, ignored if no sort field)
	uselog : bool
		True (default) to set reads below 1 to 1 and log2 transform the data before the calculation

	output:
	newexp : Experiment
		expdat with the bacteria reordered according to hs.isort of the centers of mass
	"""
	params=locals()

	# work on a (possibly sample-sorted) copy so expdat itself is not modified
	sortedexp=hs.sortsamples(expdat,field,numeric=numeric) if field else hs.copyexp(expdat)
	reads=sortedexp.data
	if uselog:
		reads[reads<1]=1
		reads=np.log2(reads)
	positions=np.arange(len(sortedexp.samples))
	# NOTE(review): a bacteria whose (transformed) reads sum to 0 gives 0/0 here - confirm how hs.isort should treat it
	centers=[np.dot(reads[rowidx,:],positions)/np.sum(reads[rowidx,:]) for rowidx in range(len(sortedexp.seqs))]
	_,order=hs.isort(centers)
	newexp=hs.reorderbacteria(expdat,order)
	newexp.filters.append("sort by center of mass field=%s, uselog=%s" % (field,uselog))
	hs.addcommand(newexp,"sortbycentermass",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #2
0
def savetsvtable(expdat,filename,logtransform=True):
	"""
	save an experiment as a tab separated table, with columns for samples and rows for bacteria
	for jose clemente long babies paper

	The first line is a header ("Taxonomy", "Sequence", then the sample names).
	Each following line holds the taxonomy, the sequence and the reads of one bacteria,
	every read value written using the "%f" format.

	input:
	expdat : Experiment
	filename : string
		name of the output tsv file
	logtransform : bool
		True (default) to save the log2 of the reads (reads below 1 are set to 1 first), False to save the reads
	"""

	# NOTE(review): copyexp is handed the data matrix rather than the experiment - kept as is, confirm it copies arrays
	ldat=hs.copyexp(expdat.data)
	if logtransform:
		ldat[np.where(ldat<1)]=1
		ldat=np.log2(ldat)

	# the with block closes the file also when writing one of the rows fails
	with open(filename,'w') as of:
		of.write("Taxonomy\tSequence"+''.join("\t%s" % csamp for csamp in expdat.samples)+"\n")
		for idx,cseq in enumerate(expdat.seqs):
			# build the complete row first so each bacteria is written with a single call
			creads=''.join("\t%f" % cval for cval in ldat[idx,:])
			of.write("%s\t%s%s\n" % (expdat.tax[idx],cseq,creads))
Exemple #3
0
def normalizeprctile(expdat,percent=80):
	"""
	normalize reads per experiment so percentile (rather than mean) will be normalized
	used to reduce effect of outliers (compositionality correction)
	note normalization is done on the same set of bacteria for all samples

	Each sample is rescaled so its percentile value (calculated on the bacteria passing
	hs.filterminreads) becomes equal to the mean of the percentile values over all samples.
	Samples whose percentile value is 0 cannot be rescaled and are left unchanged.

	input:
	expdat : Experiment
	percent : float
		the percentile to normalize (0-100)

	output:
	newexp : Experiment
		the new normalized experiment
	"""
	params=locals()

	# select the bacteria to use - don't want to include very low freq. bacteria
	filtexp=hs.filterminreads(expdat,1*len(expdat.samples))

	percvals=np.percentile(filtexp.data,percent,axis=0)
	meanperc=np.mean(percvals)
	newexp=hs.copyexp(expdat)
	for idx in range(len(expdat.samples)):
		# a 0 percentile gives no usable scale for the sample (and would divide by 0)
		if percvals[idx]<=0:
			continue
		# divide by the relative percentile so the samples end up with the same percentile value
		# (multiplying by it, as done before, increased the differences between samples)
		newexp.data[:,idx]=newexp.data[:,idx]*(meanperc/percvals[idx])
	newexp.filters.append("normalize percentile %f" % percent)
	hs.addcommand(newexp,"normalizeprctile",params=params,replaceparams={'expdat':expdat})

	return newexp
Exemple #4
0
def convertdatefield(expdat,field,newfield,timeformat='%m/%d/%y %H:%M'):
	"""
	convert a field containing date/time to a numeric (seconds since epoch) field (create a new field for that)

	input:
	expdat : Experiment
		the experiment to add the field to
	field : string
		name of the field containing the date/time format
	newfield : string
		name of the new field (with seconds since epoch)
	timeformat : string
		format of the date/time field (as used by time.strptime)

	output:
	newexp : Experiment
		the experiment with the added time since epoch field

	time.strptime raises ValueError if a sample value does not match timeformat
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	# do not list the field twice when converting into a field that already exists
	if newfield not in newexp.fields:
		newexp.fields.append(newfield)
	for csamp in newexp.samples:
		ctime=time.strptime(newexp.smap[csamp][field],timeformat)
		# mktime interprets the parsed time as local time
		newexp.smap[csamp][newfield]=time.mktime(ctime)
	newexp.filters.append('add time field %s (based on field %s)' % (newfield,field))
	hs.addcommand(newexp,"convertdatefield",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #5
0
def reloadmap(expdat,mapfilename):
	"""
	replace the mapping data of a loaded experiment with the contents of a mapping file

	input:
	expdat : Experiment
	mapfilename : string
		Name of the mapping file to reload

	output:
	newexp : Experiment
		a copy of expdat with smap, fields and mapmd5 taken from the new map file.
		a debug message is issued for every sample of the experiment missing from the new map
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	mapsamples,newexp.smap,newexp.fields,newexp.mapmd5=loadmap(mapfilename)
	# samples missing from the new map are only reported, they stay in the experiment
	missing=[csamp for csamp in newexp.samples if csamp not in mapsamples]
	for csamp in missing:
		hs.Debug(7,'Sample %s not in new map!' % csamp)
	newexp.filters.append('reload map %s' % mapfilename)
	# NOTE(review): the recorded command name differs from the function name (reloadmap) - confirm
	hs.addcommand(newexp,"reloadmapfile",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #6
0
def renamesamples(expdat,addstr,addbefore=True):
	"""
	rename all the samples in expdat by adding addstr before or after the name of each sample

	input:
	expdat : Experiment
		the experiment to change the sample names in
	addstr : str
		the string to add to each sampleid
	addbefore : bool (optional)
		True (default) to add addstr before each sampleid
		False to add addstr after each sampleid

	output:
	newexp : Experiment
		a copy of expdat with the new sample names. the mapping data of each sample is copied
		(field by field) to an entry under the new sample name
	"""
	newexp=hs.copyexp(expdat)
	if addbefore:
		newids=[addstr+csamp for csamp in newexp.samples]
	else:
		newids=[csamp+addstr for csamp in newexp.samples]
	# rebuild the sample map keyed by the new ids
	newmap={}
	for oldid,newid in zip(newexp.samples,newids):
		newmap[newid]=dict(newexp.smap[oldid].items())
	newexp.samples=newids
	newexp.smap=newmap
	return newexp
Exemple #7
0
def toorigreads(expdat,inplace=False):
	"""
	rescale every sample so its total number of reads equals its origreads value

	input:
	expdat : Experiment
	inplace : bool
		True to modify expdat itself, False (default) to work on a copy

	output:
	newexp : Experiment
		each sample sums to its origreads entry. samples with a total of 0 reads are left unchanged
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	for idx in range(len(newexp.samples)):
		creads=newexp.data[:,idx]
		totreads=np.sum(creads)
		origreads=newexp.origreads[idx]
		# an empty sample cannot be rescaled
		if totreads!=0:
			newexp.data[:,idx]=creads*(float(origreads)/totreads)
	newexp.filters.append("changed reads to origread value")
	hs.addcommand(newexp,"toorigreads",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #8
0
def toorigreads(expdat,inplace=False):
	"""
	convert the reads back to absolute integer counts using the scalingfactor of the experiment

	input:
	expdat : Experiment
	inplace : bool
		True to modify expdat itself, False (default) to work on a copy

	output:
	newexp : Experiment
		data is hs.multvec(data,scalingfactor), rounded and converted to int. scalingfactor is reset to 1
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	# undo the scaling, then store whole read counts
	newexp.data=hs.multvec(newexp.data,newexp.scalingfactor)
	newexp.data=np.round(newexp.data)
	newexp.data=newexp.data.astype(int)
	# the data is now unscaled
	newexp.scalingfactor=1

	newexp.filters.append("changed reads to origread value")
	hs.addcommand(newexp,"toorigreads",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #9
0
def fieldtobact(expdat,field,bactname='',meanreads=1000,cutoff=0):
	"""
	add a new bacteria whose reads are the (rescaled) values of a map file field (to facilitate numeric analysis)

	input:
	expdat : Experiment
	field : string
		name of the field to convert
	bactname : string
		name of the new bacteria (empty to use the field name)
	meanreads : int
		the mean number of reads of the new bacteria (over the samples with field value >= cutoff)
	cutoff : int
		the minimal value of the field per sample. samples with a lower value get meanreads

	output:
	newexp : Experiment
		with added bacteria with the field vals as reads
	"""
	params=locals()

	if len(bactname)==0:
		bactname=field
	vals=np.array(hs.tofloat(hs.getfieldvals(expdat,field)))
	# decide which samples are usable before any value is modified
	isok=vals>=cutoff
	isbad=vals<cutoff
	scalefactor=np.mean(vals[isok])
	vals[isok]=(vals[isok]/scalefactor)*meanreads
	vals[isbad]=meanreads
	newexp=hs.copyexp(expdat)
	# the bacteria name is used both as the sequence and as the taxonomy
	hs.insertbacteria(newexp,vals,bactname,bactname,logit=False)
	newexp.filters.append('add bacteria from map field %s' % field)
	hs.addcommand(newexp,"fieldtobact",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #10
0
def addmapfield(expdat,fieldname,defaultval='NA',inplace=False):
	"""
	add a new field to the mapping file

	input:
	expdat : Experiment
	fieldname : str
		name of the new field
	defaultval : str
		the value for all samples
	inplace : bool
		True to overwrite current experiment, False (default) to copy

	output:
	newexp : Experiment
		with the new field added. if the field already exists a debug message is issued
		and the experiment is returned without changing the field
	"""
	if inplace:
		newexp=expdat
	else:
		newexp=hs.copyexp(expdat)

	if fieldname in newexp.fields:
		# the field name was missing from the message (the %s was never filled in)
		hs.Debug(8,'field %s already exists' % fieldname)
		return newexp
	newexp.fields.append(fieldname)
	for csamp in newexp.samples:
		newexp.smap[csamp][fieldname]=defaultval
	return newexp
Exemple #11
0
def normalizereads(expdat,numreads=10000,fixorig=False,inplace=False):
	"""
	rescale every sample so its reads sum to numreads (default 10k)

	input:
	expdat : Experiment
	numreads : int
		the number of reads to normalize each sample to
	fixorig : bool
		True to also divide the origreads entry of each sample by the ratio used for the sample,
		False (default) to leave origreads as it is
	inplace : bool
		True to modify expdat itself, False (default) to work on a copy

	output:
	newexp : Experiment
		the normalized experiment. samples with a total of 0 reads are left unchanged
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	for idx in range(len(newexp.samples)):
		creads=newexp.data[:,idx]
		totreads=np.sum(creads)
		# an empty sample cannot be rescaled
		if totreads!=0:
			ratio=float(numreads)/totreads
			newexp.data[:,idx]=creads*ratio
			if fixorig:
				hs.Debug(2,'fixing original frequencies')
				newexp.origreads[idx]=float(newexp.origreads[idx])/ratio
	newexp.filters.append("renormalized reads to sum %d" % numreads)
	hs.addcommand(newexp,"normalizereads",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #12
0
def normalizebyseqs(expdat,seqs,exclude=False,fixorig=True):
	"""
	normalize experiment by making the sum of frequencies in seqs constant in each sample

	Each sample is divided by the sum of its normalization sequences relative to the mean of
	these sums over all samples (a sum of 0 is treated as 1).

	input:
	expdat : Experiment
	seqs : list of sequences
		the sequences to use as the normalization factor (sum of the sequences)
	exclude : bool
		True to use all sequences except in seqs as the normalization factor, False (default) to use seqs
	fixorig : bool
		True (default) to also update the origreads and scalingfactor entry of each sample, False to leave them as they were

	output:
	newexp : Experiment
		the normalized experiment

	raises KeyError if a sequence in seqs is not in expdat.seqdict
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	spos=[expdat.seqdict[cseq] for cseq in seqs]
	if exclude:
		spos=np.setdiff1d(np.arange(len(expdat.seqs)),spos)
	# +0.0 to get a float vector
	ssum=np.sum(expdat.data[spos,:],axis=0)+0.0
	# prevent division by 0 for samples without any of the normalization sequences
	ssum[ssum==0]=1
	frat=ssum/np.mean(ssum)
	for idx in range(len(expdat.samples)):
		newexp.data[:,idx]=newexp.data[:,idx]/frat[idx]
		if fixorig:
			newexp.origreads[idx]=newexp.origreads[idx]/frat[idx]
			# only the factor of this sample (the whole frat vector was used here before)
			newexp.scalingfactor[idx]=newexp.scalingfactor[idx]*frat[idx]
	filt='Normalize By Seqs '
	if len(spos)==1:
		filt+=newexp.tax[spos[0]]
	else:
		filt+=str(len(spos))
	if exclude:
		filt+=' Exclude'
	newexp.filters.append(filt)
	hs.addcommand(newexp,"normalizebyseqs",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #13
0
def addsubtrees(expdat,tree,inplace=False):
	"""
	add an otu for every subtree of the tree, with frequency equal to the sum of all the subtree bacteria

	The sequence of a new otu is the list of positions (in the experiment) of the bacteria it contains,
	each followed by a comma. Its taxonomy is the hs.common_start of the taxonomies of these bacteria.
	Subtrees with less than 2 bacteria present in the experiment are not added.

	input:
	expdat : Experiment
	tree
		the tree for the experiment (its subsets() method supplies the subtrees)
	inplace : bool
		True to start from expdat itself, False (default) to start from a copy

	output:
	newexp : Experiment
		the experiment with the subtree otus added
	"""
	params=locals()

	newexp=expdat if inplace else hs.copyexp(expdat)

	for csubtree in tree.subsets():
		positions=[]
		newtax=""
		newfreq=np.zeros([1,len(newexp.samples)])
		for cbact in csubtree:
			if cbact not in newexp.seqdict:
				hs.Debug(4,'sequence not in seqdict',cbact)
				continue
			cpos=newexp.seqdict[cbact]
			positions.append(cpos)
			newfreq+=newexp.data[cpos,:]
			# the first bacteria sets the taxonomy, the following ones shorten it to the common part
			newtax=newexp.tax[cpos] if newtax=='' else hs.common_start(newtax,newexp.tax[cpos])
		newname=''.join('%d,' % cpos for cpos in positions)
		# add only if we have 2 bacteria or more (and this combination was not added already)
		if len(positions)>1 and newname not in newexp.seqdict:
			newexp,_=insertbacteria(newexp,freqs=newfreq,seq=newname,tax=newtax,logit=False)

	newexp.filters.append("Add subtrees")
	hs.addcommand(newexp,"addsubtrees",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #14
0
def filtermapfields(expdat,fields=['#SampleID'],keep=True,inplace=False):
	"""
	filter fields from the experiment mapping data

	input:
	expdat : Experiment
	fields : list of str
		the list of the fields to keep/remove (the default list is only read, never modified)
	keep : bool (optional)
		True (default) to keep only the fields specified
		False to remove the fields specified
	inplace : bool (optional)
		False (default) to create new experiment
		True to replace in current experiment

	output:
	newexp : Experiment
		with only the fields requested. the '#SampleID' field is always kept.
		the fields keep the order they have in expdat.fields
	"""
	params=locals()

	requested=set(fields)
	wanted=bool(keep)
	# go over expdat.fields (instead of using set operations) so the field order stays the same
	newfields=[]
	for cfield in expdat.fields:
		if (cfield in requested)==wanted and cfield not in newfields:
			newfields.append(cfield)
	if '#SampleID' not in newfields:
		newfields.append('#SampleID')

	newsmap={}
	for csamp in expdat.samples:
		newsmap[csamp]={cfield:expdat.smap[csamp][cfield] for cfield in newfields}

	if inplace:
		newexp=expdat
	else:
		newexp=hs.copyexp(expdat)
	newexp.fields=newfields
	newexp.smap=newsmap

	# record the history on the returned experiment (it was recorded on the input experiment before,
	# which modified expdat also when inplace=False and left the new experiment without the entry)
	newexp.filters.append('filter map fields %s (keep=%s)' % (fields,keep))
	hs.addcommand(newexp,"filtermapfields",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #15
0
def sortbysign(expdat,field=False,value='',exclude=False,exact=True,maxfval=0.2):
	"""
	keep the bacteria with a significant excess of positive or negative samples and sort them by mean sign
	(ignoring nans)

	For each bacteria the number of samples with positive and with negative value is tested using
	a binomial test (p=0.5), and the p-values are passed to hs.fdr.

	input:
	expdat : Experiment
	field,value,exclude,exact : name of field and value of field (passed to hs.filtersamples)
		in order to test based only on these samples, or field=False for all samples (default)
	maxfval : float
		the maximal value returned by hs.fdr for a bacteria to be kept

	output:
	newexp : Experiment
		expdat with only the bacteria passing maxfval, sorted by their mean sign
	"""
	params=locals()

	texp=hs.filtersamples(expdat,field,value,exact=exact,exclude=exclude) if field else hs.copyexp(expdat)

	# only the sign of each value is used
	texp.data=np.sign(texp.data)
	numpos=np.nansum(texp.data>0,axis=1)
	numneg=np.nansum(texp.data<0,axis=1)
	pval=np.ones(len(numpos))
	for cpos in range(len(pval)):
		cnumpos=numpos[cpos]
		cnumneg=numneg[cpos]
		# nothing to test - the p-value stays 1
		if cnumpos==0 and cnumneg==0:
			continue
		ntot=cnumpos+cnumneg
		lowpos=stats.binom.cdf(cnumpos,ntot,0.5)
		lowneg=stats.binom.cdf(cnumneg,ntot,0.5)
		pval[cpos]=np.min([lowpos,lowneg])

	meansign=np.nanmean(texp.data,axis=1)

	fval=hs.fdr(pval)
	keep=np.where(np.array(fval)<=maxfval)[0]
	# first keep the significant bacteria, then order them
	newexp=hs.reorderbacteria(expdat,keep)
	newexp=hs.reorderbacteria(newexp,np.argsort(meansign[keep]))
	newexp.filters.append("sort by sign field %s max-f-val %f" % (field,maxfval))
	hs.addcommand(newexp,"sortbysign",params=params,replaceparams={'expdat':expdat})
	return newexp
Exemple #16
0
def zerobacteria(expdat,inplace=False):
	"""
	remove all the bacteria from an experiment (can then add using insertbacteria)

	input:
	expdat : Experiment
	inplace : bool
		True to modify expdat itself, False (default) to work on a copy

	output:
	newexp : Experiment
		data has 0 rows (one column per sample). seqs, tax, sids and seqdict are empty
	"""
	newexp=expdat if inplace else hs.copyexp(expdat)

	numsamples=len(newexp.samples)
	newexp.data=np.zeros((0,numsamples))
	# clear everything describing the bacteria
	newexp.seqs,newexp.tax,newexp.sids=[],[],[]
	newexp.seqdict=dict()
	return newexp
Exemple #17
0
def taxtoseq(expdat,fixtax=False):
	"""
	use the taxonomy of each bacteria as its sequence

	input:
	expdat : Experiment
	fixtax: bool (optional)
		False (default) to use the taxonomy as is.
		True to use the first 7 ';' separated levels, each without its first 3 characters (the k__ etc.)
		and each followed by ';' (missing levels are left empty)

	output:
	newexp : Experiment
		with seqs=taxonomies. a taxonomy that was already used gets '-' and the bacteria position
		added to keep the sequences unique. seqdict is rebuilt for the new sequences
	"""
	newexp=hs.copyexp(expdat)
	if fixtax:
		names=[]
		for ctax in newexp.tax:
			levels=ctax.split(';')
			trimmed=[levels[clevel][3:] if clevel<len(levels) else '' for clevel in range(7)]
			names.append(';'.join(trimmed)+';')
	else:
		names=newexp.tax
	seqdict={}
	newseqs=[]
	for idx,cseq in enumerate(names):
		if cseq in seqdict:
			hs.Debug(8,'found %s again' % cseq)
			cseq=cseq+'-'+str(idx)
		newseqs.append(cseq)
		seqdict[cseq]=idx
	newexp.seqdict=seqdict
	newexp.seqs=newseqs
	return newexp
Exemple #18
0
	def cleantaxonomy(self):
		"""
		open the clean taxonomy dialog for the selected experiment and add the cleaned experiment to the list

		Exactly one item must be selected in bMainList. If the dialog is accepted, the taxonomies
		whose checkbox is checked are removed from a copy of the experiment, the reads are
		renormalized and the result is added (using addexp) with '.ct' appended to the study name.
		"""
		items=self.bMainList.selectedItems()
		if len(items)!=1:
			print("Need 1 item")
			return
		cexp=self.explist[str(items[0].text())]
		ctwin = CleanTaxonomyWindow(cexp)
		if ctwin.exec_()!=QtGui.QDialog.Accepted:
			return
		newexp=hs.copyexp(cexp)
		if ctwin.cMitochondria.checkState():
			newexp=hs.filtertaxonomy(newexp,'mitochondria',exclude=True)
		if ctwin.cChloroplast.checkState():
			# chloroplasts are removed using both taxonomy names
			for ctax in ['Streptophyta','Chloroplast']:
				newexp=hs.filtertaxonomy(newexp,ctax,exclude=True)
		if ctwin.cUnknown.checkState():
			newexp=hs.filtertaxonomy(newexp,'nknown',exclude=True)
		if ctwin.cBacteria.checkState():
			newexp=hs.filtertaxonomy(newexp,'Bacteria;',exclude=True,exact=True)
		newexp=hs.normalizereads(newexp)
		newexp.studyname=cexp.studyname+'.ct'
		self.addexp(newexp)
0
def changemapval(expdat,newfield,newval,oldfield,vals,inplace=False):
	"""
	set the value of one mapping file field in the samples selected by the value of another field

	input:
	expdat : Experiment
	newfield : name of the field to change the values in (from addmapfield?)
	newval : the new value to put
	oldfield : the field with the values to test
	vals : a list of values. newfield is set to newval only in samples where the value of oldfield is in the list
	inplace : bool
		True to overwrite current experiment, False (default) to copy

	output:
	newexp : Experiment
		with the updated mapping values
	"""

	newexp=expdat if inplace else hs.copyexp(expdat)

	for csamp in newexp.samples:
		cmap=newexp.smap[csamp]
		if cmap[oldfield] in vals:
			cmap[newfield]=newval

	return newexp
Exemple #20
0
def cleantaxonomy(expdat,mitochondria=True,chloroplast=True,bacteria=True,unknown=True,exclude=True):
	"""
	remove common non-16s sequences from the experiment and renormalize

	input:
	expdat : Experiment
	mitochondria : bool
		remove mitochondrial sequences
	chloroplast : bool
		remove chloroplast sequences
	bacteria : bool
		remove sequences only identified as "Bacteria" (no finer identification)
	unknown : bool
		remove unknown sequences
	exclude : bool
		True (default) to remove these sequences (and renormalize),
		False to keep only them and throw the others (without renormalizing)

	output:
	newexp : Experiment
		the renormalized experiment without these bacteria
	"""
	params=locals()

	# the taxonomy strings of the selected categories, with the extra hs.filtertaxonomy arguments of each
	targets=[]
	if mitochondria:
		targets.append(('mitochondria',{}))
	if chloroplast:
		targets.append(('Streptophyta',{}))
		targets.append(('Chloroplast',{}))
	if unknown:
		targets.append(('Unknown',{}))
		targets.append(('Unclassified;',{'exact':True}))
	if bacteria:
		targets.append(('Bacteria;',{'exact':True}))

	newexp=hs.copyexp(expdat)
	if exclude:
		for ctax,cargs in targets:
			newexp=hs.filtertaxonomy(newexp,ctax,exclude=True,**cargs)
		newexp=hs.normalizereads(newexp)
	else:
		# collect the sequences of the selected categories only. (before, a category that was
		# not selected left its result undefined and caused a NameError when joining the sequences)
		allseqs=[]
		for ctax,cargs in targets:
			allseqs+=hs.filtertaxonomy(newexp,ctax,exclude=False,**cargs).seqs
		allseqs=list(set(allseqs))
		newexp=hs.filterseqs(newexp,allseqs)
	newexp.filters.append('Clean Taxonomy (remove mitochondria etc.)')
	hs.addcommand(newexp,"cleantaxonomy",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d sequences before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
	return newexp
Exemple #21
0
def filtersimilarsamples(expdat,field,method='mean'):
	"""
	join similar samples into one sample (i.e. to remove samples of same individual)

	input:
	expdat : Experiment
	field : string
		Name of the field containing the values (for which similar values will be joined)
	method : string
		What to do with samples with similar value. options:
		'mean' - replace with a sample containing the mean of the samples
		'median'- replace with a sample containing the median of the samples
		'random' - replace with a single random sample out of these samples
		'sum' - replace with sum of original reads in all samples, renormalized after to 10k and orignumreads updated
		'fracpres' - replace with fraction of samples where the bacteria is present

	output:
	newexp : Experiment
		like the input experiment but only one sample per unique value in field.
		in the joined sample, mapping fields that differ between the joined samples are set to 'NA'.
		False is returned if an unsupported method is met when joining samples
	"""
	params=locals()

	newexp=hs.copyexp(expdat)
	if method=='sum':
		newexp=hs.toorigreads(newexp)
	uvals=hs.getfieldvals(expdat,field,ounique=True)
	keep=[]
	for cval in uvals:
		cpos=hs.findsamples(expdat,field,cval)
		firstpos=cpos[0]
		if len(cpos)==1:
			keep.append(firstpos)
			continue
		if method=='random':
			keep.append(cpos[np.random.randint(len(cpos))])
			continue
		# set the mapping file values
		# work on a copy of the mapping of the first sample so the input experiment is not modified
		cmap=dict(expdat.smap[expdat.samples[firstpos]])
		for ccpos in cpos[1:]:
			othermap=expdat.smap[expdat.samples[ccpos]]
			for cfield in cmap.keys():
				if cmap[cfield]!=othermap[cfield]:
					cmap[cfield]='NA'
		if method=='mean':
			newexp.data[:,firstpos]=np.mean(expdat.data[:,cpos],axis=1)
		elif method=='median':
			newexp.data[:,firstpos]=np.median(expdat.data[:,cpos],axis=1)
		elif method=='sum':
			# newexp holds the original number of reads here (from toorigreads)
			newexp.data[:,firstpos]=np.sum(newexp.data[:,cpos],axis=1)
			newexp.origreads[firstpos]=np.sum(hs.reorder(expdat.origreads,cpos))
		elif method=='fracpres':
			# float() so the fraction is not truncated by integer division
			newexp.data[:,firstpos]=np.sum(expdat.data[:,cpos]>0,axis=1)/float(len(cpos))
		else:
			hs.Debug(9,'method %s not supported' % method)
			return False
		keep.append(firstpos)
		newexp.smap[expdat.samples[firstpos]]=cmap
	newexp=hs.reordersamples(newexp,keep)
	if method=='sum':
		newexp=hs.normalizereads(newexp)
	newexp.filters.append('Filter similar samples field %s method %s' % (field,method))
	hs.addcommand(newexp,"filtersimilarsamples",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d samples before filtering, %d after' % (len(expdat.samples),len(newexp.samples)))
	return newexp
Exemple #22
0
def plotexp(exp,sortby=False,numeric=False,minreads=4,rangeall=False,seqdb=None,cdb=None,showline=True,ontofig=False,usegui=True,showxall=False,showcolorbar=False,ptitle=False,lowcutoff=1,uselog=True,showxlabel=True,colormap=False,colorrange=False):
	"""
	Plot an experiment as a heatmap (one row per bacteria, one column per sample)

	The samples are optionally sorted by a mapping file field and the bacteria are filtered
	using hs.filterminreads. The (optionally log2 transformed) data is then shown using imshow
	in a new figure, with mouse and keyboard callbacks connected, and optionally the gui window is opened.

	input:
	exp - from load()
	sortby - name of mapping file field to sort by or False to not sort
	numeric - True if the field is numeric
	minreads - minimum number of reads per bacteria in order to show it or 0 to show all
	rangeall - True to show all frequencies in image scale, false to saturate at 10%
	seqdb - the SRBactDB database (from bactdb.load)
	cdb - the cool sequences database (from cooldb.load)
	showline - if True plot lines between category values
	ontofig - name of ontology to plot for bactdb or false to no plot
	usegui - True use a gui for otu summary, False just print
	showxall - True to show all sample names when not sorting, False to show no more than 10
	showcolorbar - True to plot the colorbar. False to not plot
	ptitle - name of the figure or False to show processing history as name
	lowcutoff - minimal value for read (for 0 log transform) - the minimal resolution - could be 10000*2/origreads
	uselog - True (default) to set values below lowcutoff to lowcutoff and log2 transform the data before plotting
		(also passed as logit to hs.filterminreads)
	showxlabel : bool
		True to show the x label (default), False to hide it
	colormap : string or False
		name of colormap or False (default) to use mpl default colormap
	colorrange : [min,max] or False
		[min,max] to set the colormap range, False to use data min,max (default) as specified in rangeall

	output:
	newexp - the plotted experiment (sorted and filtered)
	ax - the plot axis
	"""

	hs.Debug(1,"Plot experiment %s" % exp.studyname)
	hs.Debug(1,"Commands:")
	for ccommand in exp.commands:
		hs.Debug(1,"%s" % ccommand)
	# svals will hold the per-sample values used for the x axis labels (field values or sample ids)
	vals=[]
	if sortby:
		hs.Debug(1,"Sorting by field %s" % sortby)
		for csamp in exp.samples:
			vals.append(exp.smap[csamp][sortby])
		if numeric:
			hs.Debug(1,"(numeric sort)")
			vals=hs.tofloat(vals)
		svals,sidx=hs.isort(vals)
		newexp=hs.reordersamples(exp,sidx)
	else:
		hs.Debug(1,"No sample sorting")
		svals=hs.getfieldvals(exp,'#SampleID')
		newexp=hs.copyexp(exp)
	hs.Debug(1,"Filtering min reads. original bacteria - %d" % len(newexp.seqs))
	if minreads>0:
		newexp=hs.filterminreads(newexp,minreads,logit=uselog)
	hs.Debug(1,"New number of bacteria %d" % len(newexp.seqs))
	# attach the databases to the plotted experiment
	newexp.seqdb=seqdb
	newexp.cdb=cdb

#	ldat=ldat[:,sidx]
	# note ldat references newexp.data, so the cutoff below is applied to the data of newexp itself
	ldat=newexp.data
	if uselog:
		hs.Debug(1,"Using log, cutoff at %f" % lowcutoff)
		ldat[np.where(ldat<lowcutoff)]=lowcutoff
		ldat=np.log2(ldat)
	# NOTE(review): oldparams is the same object as plt.rcParams (no copy is made), so assigning it back
	# at the end does not look like it undoes the keymap changes below - confirm this is intended
	oldparams=plt.rcParams
	# change the matplotlib keyboard shortcuts
	mpl.rc('keymap',back='c, backspace')
	mpl.rc('keymap',forward='v')
	mpl.rc('keymap',all_axes='A')
	f=figure()
	# set the colormap to default if not supplied
	if not colormap:
		colormap=plt.rcParams['image.cmap']
	# plot the image
	if colorrange:
		# NOTE(review): the message says 0,10 but the range used in this branch is colorrange
		hs.Debug(1,"colormap range is 0,10")
		iax=imshow(ldat,interpolation='nearest',aspect='auto',clim=colorrange,cmap=plt.get_cmap(colormap))
	elif rangeall:
		hs.Debug(1,"colormap range is all")
		iax=imshow(ldat,interpolation='nearest',aspect='auto',cmap=plt.get_cmap(colormap))
	else:
		hs.Debug(1,"colormap range is 0,10")
		iax=imshow(ldat,interpolation='nearest',aspect='auto',clim=[0,10],cmap=plt.get_cmap(colormap))

	# if no title was supplied, use the processing history
	# (when more than 4 filters: the first and the last two, with '...' in between)
	if not ptitle:
		hs.Debug(1,"Showing filters in title")
		if (len(newexp.filters))>4:
			cfilters=[newexp.filters[0],'...',newexp.filters[-2],newexp.filters[-1]]
		else:
			cfilters=newexp.filters
		cfilters=hs.clipstrings(cfilters,30)
		ptitle='\n'.join(cfilters)
	title(ptitle,fontsize=10)

	ax=iax.get_axes()
	ax.autoscale(False)
	if showline:
		hs.Debug(1,"Showing lines")
		labs=[]
		labpos=[]
		linepos=[]
		minpos=0
		# sentinel value so the last group of samples is also closed
		svals.append('end')
		# a group is a run of consecutive samples with the same value:
		# a line is drawn after the last sample of each group and the label is placed at the group center
		for idx,cval in enumerate(svals[:-1]):
			if cval==svals[idx+1]:
				continue
			labpos.append(minpos-0.5+float(idx+1-minpos)/2)
			minpos=idx+1
			linepos.append(idx+0.5)
			labs.append(cval)
		hs.Debug(1,"number of lines is %d" % len(linepos))
		if showxlabel:
			ax.set_xticks(labpos)
			ax.set_xticklabels(labs,rotation=45,ha='right')
		# vertical black line over all the bacteria at each group border
		for cx in linepos:
			plot([cx,cx],[-0.5,np.size(ldat,0)-0.5],'k',linewidth=2)
	else:
		hs.Debug(1,"Not showing lines")
		if showxall or len(newexp.samples)<=10:
			hs.Debug(1,"less than 10 samples, showing all sample names")
			ax.set_xticklabels(svals,rotation=90)
			ax.set_xticks(range(len(newexp.samples)))
	tight_layout()
	ax.set_ylim(-0.5,np.size(ldat,0)+0.5)

	if showcolorbar:
		hs.Debug(1,"Showing colorbar")
		# ticks are placed at log2 of the read numbers (the labels assume uselog and 10k reads per sample - TODO confirm)
		cb=colorbar(ticks=list(np.log2([2,10,100,500,1000])))
		cb.ax.set_yticklabels(['<0.02%','0.1%','1%','5%','>10%'])

	# create the plot
	# store the plot state on the axis (presumably used by the mouse/keyboard callbacks connected below)
	ax.expdat=newexp
	ax.lastselect=-1
	ax.sampline=''
	ax.ofig=f
	ax.labelson=False
	ax.labelnames=[]
	f.canvas.mpl_connect('button_press_event', onplotmouseclick)
	f.canvas.mpl_connect('key_press_event', onplotkeyclick)
#	show()
	plt.rcParams=oldparams

	# if want the ontology analysis for a given category:
	if ontofig:
		hs.Debug(1,"Ontofig is set")
		newexp.ontofigname=ontofig
	else:
		newexp.ontofigname=False

	# if we want gui, open it
	if usegui:
		hs.Debug(1,"Using the GUI window")
		# imported here so the gui module is loaded only when the gui is requested
		import heatsequer.plots.plotwingui
		guiwin = heatsequer.plots.plotwingui.PlotGUIWindow(newexp)
#		from heatsequer.plots import plotwingui
#		guiwin = plotwingui.PlotGUIWindow(newexp)
		# link the gui window and the plot to each other
		ax.guiwin=guiwin
		guiwin.plotfig=f
		guiwin.plotax=ax
		guiwin.show()
	else:
		ax.guiwin=False
		hs.Debug(7,'Not using gui')

	# each plotmetadata entry holds (field,value,color,inverse,beforesample) for addplotmetadata
	if newexp.plotmetadata:
		hs.Debug(1,"Experiment has metadata attached for plotting (%d points)" % len(newexp.plotmetadata))
		for cmet in newexp.plotmetadata:
			addplotmetadata(newexp,field=cmet[0],value=cmet[1],color=cmet[2],inverse=cmet[3],beforesample=cmet[4])
	show()
	return newexp,ax
Exemple #23
0
def plotexp(exp,sortby=False,numeric=False,minreads=4,rangeall=False,seqdb=None,cdb=None,showline=True,ontofig=False,usegui=True,showxall=False,showcolorbar=False,ptitle=False,lowcutoff=1,uselog=True,showxlabel=True,colormap=False,colorrange=False,linewidth=2,subline='',showhline=True,newfig=True,fixfont=False,fontsize=None,nosort=False,zeroisnone=False,xlabelrotation=45,showtaxnames=False):
	"""
	Plot an experiment as an interactive heatmap (bacteria as rows, samples as columns)
	input:
	exp - from load()
	sortby - name of mapping file field to sort by or False to not sort
	numeric - True if the field is numeric
	minreads - minimum number of reads per bacteria in order to show it or 0 to show all
	rangeall - True to show all frequencies in image scale, False to saturate the color scale at [0,10]
	seqdb - the SRBactDB database (from bactdb.load), or None (default) to use hs.bdb
	cdb - the cool sequences database (from cooldb.load), or None (default) to use the heatsequer loaded cdb
	showline - if True plot lines between category values
	ontofig - name of ontology to plot for bactdb or False to not plot
	usegui - True use a gui for otu summary, False just print
	showxall - True to show all sample names when not showing lines, False to show them only when there are no more than 10 samples
	showcolorbar - True to plot the colorbar. False to not plot
	ptitle : str (optional)
		False or '' to show the processing history (filters) as the title, None to not show a title,
		or the string to use as the figure title
	lowcutoff - minimal value for read (for 0 log transform) - the minimal resolution - could be 10000*2/origreads
	uselog : bool
		True (default) to clip the data at lowcutoff and plot its log2, False to plot the raw values
	showxlabel : bool
		True to show the x label (default), False to hide it
	colormap : string or False
		name of colormap or False (default) to use mpl default colormap
	colorrange : [min,max] or False
		[min,max] to set the colormap range, False to use data min,max (default) as specified in rangeall
	linewidth : number
		width of the vertical lines drawn between category values
	subline : str
		Name of category for subline plotting or '' (Default) for no sublines
	showhline : bool
		True (default) to plot the horizontal lines listed in exp.hlines. False to not plot them
	newfig : bool
		True (default) to open figure in new window, False to use current
	fixfont : bool (optional)
		False (default) to keep the default font, True to use a monospace font for the y tick labels
	fontsize : int or None (optional)
		None (default) to use default font size, number to use that font size
	nosort : bool (optional)
		False (default) to sort by the sort field, True to skip the sorting
	zeroisnone : bool (optional)
		False (default) to plot zeros as 0, True to assign None (white color)
	xlabelrotation : int (optional)
		the rotation of the xtick labels
	showtaxnames : bool (optional)
		False (default) to not show tax names (need to press 'h' to show)
		True to show the taxonomy names

	output:
	newexp - the plotted experiment (sorted and filtered)
	ax - the plot axis
	"""

	hs.Debug(1,"Plot experiment %s" % exp.studyname)
	hs.Debug(1,"Commands:")
	for ccommand in exp.commands:
		hs.Debug(1,"%s" % ccommand)

	if exp.sparse:
		hs.Debug(9,'Sparse matrix - converting to dense')
		exp=hs.copyexp(exp,todense=True)

	vals=[]
	if cdb is None:
		cdb=hs.cdb
	if seqdb is None:
		seqdb=hs.bdb
	if sortby:
		if not nosort:
			hs.Debug(1,"Sorting by field %s" % sortby)
			for csamp in exp.samples:
				vals.append(exp.smap[csamp][sortby])
			if numeric:
				hs.Debug(1,"(numeric sort)")
				vals=hs.tofloat(vals)
			svals,sidx=hs.isort(vals)
			newexp=hs.reordersamples(exp,sidx)
		else:
			hs.Debug(1,"no sorting but showing columns")
			svals=hs.getfieldvals(exp,sortby)
			newexp=hs.copyexp(exp)
	else:
		hs.Debug(1,"No sample sorting")
		svals=hs.getfieldvals(exp,'#SampleID')
		newexp=hs.copyexp(exp)
	hs.Debug(1,"Filtering min reads. original bacteria - %d" % len(newexp.seqs))
	if minreads>0:
		newexp=hs.filterminreads(newexp,minreads,logit=uselog)
	hs.Debug(1,"New number of bacteria %d" % len(newexp.seqs))
	newexp.seqdb=seqdb
	newexp.cdb=cdb
	newexp.scdb=hs.scdb

	# NOTE(review): ldat aliases newexp.data, so the two masked assignments below
	# also modify the data of the returned experiment (the log2 itself does not,
	# since np.log2 returns a new array). Kept as in the original - confirm intent.
	ldat=newexp.data
	if zeroisnone:
		ldat[ldat==0]=None
	if uselog:
		hs.Debug(1,"Using log, cutoff at %f" % lowcutoff)
		ldat[np.where(ldat<lowcutoff)]=lowcutoff
		ldat=np.log2(ldat)
	oldparams=plt.rcParams
	# remap the matplotlib default keys (presumably to free them for onplotkeyclick)
	mpl.rc('keymap',back='c, backspace')
	mpl.rc('keymap',forward='v')
	mpl.rc('keymap',all_axes='A')
	if newfig:
		f=plt.figure(tight_layout=True)
	else:
		f=plt.gcf()
	# set the colormap to default if not supplied
	if not colormap:
		colormap=plt.rcParams['image.cmap']
	# plot the image
	if colorrange:
		hs.Debug(1,"colormap range is %s" % str(colorrange))
		iax=plt.imshow(ldat,interpolation='nearest',aspect='auto',clim=colorrange,cmap=plt.get_cmap(colormap))
	elif rangeall:
		hs.Debug(1,"colormap range is all")
		iax=plt.imshow(ldat,interpolation='nearest',aspect='auto',cmap=plt.get_cmap(colormap))
	else:
		hs.Debug(1,"colormap range is 0,10")
		iax=plt.imshow(ldat,interpolation='nearest',aspect='auto',clim=[0,10],cmap=plt.get_cmap(colormap))

	if ptitle is not None:
		if not ptitle:
			hs.Debug(1,"Showing filters in title")
			# for a long history show only the first and the last two filters
			if (len(newexp.filters))>4:
				cfilters=[newexp.filters[0],'...',newexp.filters[-2],newexp.filters[-1]]
			else:
				cfilters=newexp.filters
			cfilters=hs.clipstrings(cfilters,30)
			ptitle='\n'.join(cfilters)
		plt.title(ptitle,fontsize=10)

	# use the axes attribute (Artist.get_axes() is not available in newer matplotlib)
	ax=iax.axes
	ax.autoscale(False)

	# plot the sublines (smaller category lines)
	if subline:
		slval=hs.getfieldvals(newexp,subline)
		prevval=slval[0]
		for idx,cval in enumerate(slval):
			if cval!=prevval:
				xpos=idx-0.5
				plt.plot([xpos,xpos],[-0.5,np.size(ldat,0)-0.5],'w:')
				prevval=cval

	if showline:
		hs.Debug(1,"Showing lines")
		labs=[]
		labpos=[]
		linepos=[]
		minpos=0
		# sentinel so the last run of identical values is also closed
		svals.append('end')
		for idx,cval in enumerate(svals[:-1]):
			if cval==svals[idx+1]:
				continue
			# label is centered on the run of identical values [minpos..idx]
			labpos.append(minpos-0.5+float(idx+1-minpos)/2)
			minpos=idx+1
			linepos.append(idx+0.5)
			labs.append(cval)
		hs.Debug(1,"number of lines is %d" % len(linepos))
		if showxlabel:
			ax.set_xticks(labpos)
			ax.set_xticklabels(labs,rotation=xlabelrotation,ha='right')
		for cx in linepos:
			# black line overlaid with a white dotted line
			plt.plot([cx,cx],[-0.5,np.size(ldat,0)-0.5],'k',linewidth=linewidth)
			plt.plot([cx,cx],[-0.5,np.size(ldat,0)-0.5],'w:',linewidth=linewidth)
	else:
		hs.Debug(1,"Not showing lines")
		if showxall or len(newexp.samples)<=10:
			hs.Debug(1,"less than 10 samples, showing all sample names")
			ax.set_xticklabels(svals,rotation=90)
			ax.set_xticks(range(len(newexp.samples)))
	ax.set_ylim(-0.5,np.size(ldat,0)-0.5)

	if fixfont:
		fontProperties = {'family':'monospace'}
		ax.set_yticklabels(ax.get_yticks(), fontProperties)

	if showcolorbar:
		hs.Debug(1,"Showing colorbar")
		cb=plt.colorbar(ticks=list(np.log2([2,10,100,500,1000])))
		cb.ax.set_yticklabels(['<0.02%','0.1%','1%','5%','>10%'])

	# attach the state used by the mouse/keyboard callbacks to the axis
	ax.expdat=newexp
	ax.lastselect=-1
	ax.sampline=''
	ax.ofig=f
	ax.labelson=False
	ax.labelnames=[]
	f.canvas.mpl_connect('button_press_event', onplotmouseclick)
	f.canvas.mpl_connect('key_press_event', onplotkeyclick)
	# NOTE(review): oldparams is the same object as plt.rcParams (no copy was taken),
	# so this assignment does not undo the keymap changes made above. Left as is,
	# since a real restore would cancel the key remapping before the user interacts
	# with the figure - confirm intent.
	plt.rcParams=oldparams

	# if want the ontology analysis for a given category:
	if ontofig:
		hs.Debug(1,"Ontofig is set")
		newexp.ontofigname=ontofig
	else:
		newexp.ontofigname=False

	# if we want gui, open it (the window is created exactly once)
	if usegui:
		hs.Debug(1,"Using the GUI window")
		from heatsequer.plots import plotwingui
		guiwin = plotwingui.PlotGUIWindow(newexp)
		ax.guiwin=guiwin
		guiwin.plotfig=f
		guiwin.plotax=ax
		guiwin.show()
	else:
		ax.guiwin=False
		hs.Debug(7,'Not using gui')

	ax.plot_labelsize=fontsize
	if newexp.plotmetadata:
		hs.Debug(1,"Experiment has metadata attached for plotting (%d points)" % len(newexp.plotmetadata))
		for cmet in newexp.plotmetadata:
			addplotmetadata(newexp,field=cmet[0],value=cmet[1],color=cmet[2],inverse=cmet[3],beforesample=cmet[4])
	if showhline:
		if newexp.hlines:
			for cpos in newexp.hlines:
				plt.plot([0,np.shape(newexp.data)[1]],[cpos-0.5,cpos-0.5],'g')
	plt.show()
	if showtaxnames:
		showtaxonomies(newexp,ax,showdb=False,showcontam=False)

	return newexp,ax
Exemple #24
0
def joinexperiments(exp1,exp2,missingval='NA',origfieldname='origexp',addbefore=False):
	"""
	join 2 experiments into a new experiment. adding a new field origfieldname
	input:
	exp1,exp2 - the experiments to join
	missingval - string to put when field not in mapping file of one of the experiments
	origfieldname - name of the new field to add which contains the original experiment name
		(if an experiment already has this field, its per-sample values are kept)
	addbefore : bool (optional)
		Used only when the same sampleid appears in both experiments, in which case the samples
		are renamed with '-1' (exp1) / '-2' (exp2). The flag is passed as is to hs.renamesamples:
		False (default) to add '-1'/'-2' after sampleid
		True to add '-1'/'-2' before sampleid (presumably - confirm in hs.renamesamples)

	output:
	newexp - the joined experiment. Sequences are ordered as in exp1 followed by the
		sequences present only in exp2; samples are the exp1 samples followed by the exp2 samples
	"""
	params=locals()

	# test if same sampleid exists in both experiments. if so, add "-1" and "-2" to sampleid
	if set(exp1.samples) & set(exp2.samples):
		hs.Debug(6,'same sampleID - renaming samples')
		exp1=hs.renamesamples(exp1,'-1',addbefore=addbefore)
		exp2=hs.renamesamples(exp2,'-2',addbefore=addbefore)

	# join the sequences of both experiments
	# ASSUMING SAME SEQ LENGTH!!!!
	# keep first-seen order (exp1 then exp2) so the result does not depend on set ordering
	allseqs=[]
	alldict={}
	for cseq in list(exp1.seqs)+list(exp2.seqs):
		if cseq not in alldict:
			alldict[cseq]=len(allseqs)
			allseqs.append(cseq)

	# make the new joined data for each experiment (zeros where the sequence is absent)
	# taxonomy is taken from exp1 when the sequence is there, otherwise from exp2
	alltax=[]
	dat1=np.zeros((len(allseqs),np.size(exp1.data,1)))
	dat2=np.zeros((len(allseqs),np.size(exp2.data,1)))
	for idx,cseq in enumerate(allseqs):
		if cseq in exp1.seqdict:
			pos1=exp1.seqdict[cseq]
			dat1[idx,:]=exp1.data[pos1,:]
			alltax.append(exp1.tax[pos1])
		else:
			alltax.append(exp2.tax[exp2.seqdict[cseq]])
		if cseq in exp2.seqdict:
			dat2[idx,:]=exp2.data[exp2.seqdict[cseq],:]

	newexp=hs.copyexp(exp1)
	# concatenate the reads
	newexp.data=np.concatenate((dat1,dat2), axis=1)
	newexp.seqdict=alldict
	newexp.seqs=allseqs
	newexp.tax=alltax
	# the joined experiment uses the sequences themselves as the ids
	newexp.sids=newexp.seqs
	newexp.samples = list(exp1.samples) + list(exp2.samples)
	newexp.origreads=exp1.origreads+exp2.origreads
	newexp.scalingfactor=np.hstack([exp1.scalingfactor,exp2.scalingfactor])

	# union of the fields, keeping first-seen order (exp1 fields first)
	fields1=set(exp1.fields)
	fields2=set(exp2.fields)
	allfields=[]
	seenfields=set()
	for cfield in list(exp1.fields)+list(exp2.fields):
		if cfield not in seenfields:
			seenfields.add(cfield)
			allfields.append(cfield)
	newexp.fields=allfields

	# fields that exist only in exp2 get missingval for the exp1 samples
	for cfield in newexp.fields:
		if cfield in fields1:
			continue
		for csamp in exp1.samples:
			newexp.smap[csamp][cfield]=missingval

	# add the exp2 samples to the mapping, with missingval for fields only in exp1
	for csamp in exp2.samples:
		newexp.smap[csamp]={}
		for cfield in newexp.fields:
			if cfield in fields2:
				newexp.smap[csamp][cfield]=exp2.smap[csamp][cfield]
			else:
				newexp.smap[csamp][cfield]=missingval

	# store the experiment of origin, keeping a previous value if the experiment
	# already carries the field (e.g. it is itself the result of a join)
	hasorig1=origfieldname in fields1
	for csamp in exp1.samples:
		if hasorig1:
			cname=exp1.smap[csamp][origfieldname]
		else:
			cname=exp1.studyname
		newexp.smap[csamp][origfieldname]=cname
	hasorig2=origfieldname in fields2
	for csamp in exp2.samples:
		if hasorig2:
			cname=exp2.smap[csamp][origfieldname]
		else:
			cname=exp2.studyname
		newexp.smap[csamp][origfieldname]=cname
	if origfieldname not in newexp.fields:
		newexp.fields.append(origfieldname)

	newexp.filters.append('joined with %s' % exp2.studyname)
	hs.addcommand(newexp,"joinexperiments",params=params,replaceparams={'exp1':exp1,'exp2':exp2})
	return newexp
Exemple #25
0
def filterwave(expdat,field=False,numeric=True,minfold=2,minlen=3,step=1,direction='up',posloc='start'):
	"""
	filter bacteria, keeping only ones that show a consecutive region of samples with higher/lower mean than other samples
	Done by scanning all windowlen/startpos options for each bacteria
	The comparison is done on log2 of the reads (values below 1 are treated as 1)
	input:
	expdat : Experiment
	field : string
		The field to sort by or False to skip sorting
	numeric : bool
		For the sorting according to field (does not matter if field is False)
	minfold : float
		The minimal difference between the mean log2 reads inside the window and the mean log2
		reads of all the samples outside the window in order to keep the bacteria
	step : int
		The skip between tested windows (to make it faster use a larger skip)
	minlen : int
		The minimal window len for over/under expression testing
	direction : string
		'both' - test both over and under expression in the window
		'up' - only overexpressed
		'down' - only underexpressed
		(any other value is treated as 'up')
	posloc : string
		The position used for ordering the bacteria:
		'start' - the beginning of the window with the maximal difference
		'mid' - the middle of the window with the maximal difference
		'gstart' - the beginning of the first window with >=minfold difference

	output:
	newexp : Experiment
		The filtered experiment, sorted according to window start samples position
		or False if posloc is not supported
	"""
	params=locals()

	if posloc not in ('gstart','start','mid'):
		hs.Debug(9,'posloc not supported %s' % posloc)
		return False

	# sort if needed
	if field:
		newexp=hs.sortsamples(expdat,field,numeric=numeric)
	else:
		newexp=hs.copyexp(expdat)

	# log transform a clipped copy, so the reads of the returned experiment are not modified
	dat=np.log2(np.maximum(newexp.data,1))
	numsamples=len(newexp.samples)
	numbact=len(newexp.seqs)
	maxdiff=np.zeros([numbact])
	maxpos=np.zeros([numbact])-1
	maxlen=np.zeros([numbact])
	allpos=np.arange(numsamples)
	for startpos in range(numsamples-minlen):
		# cwin<numsamples-startpos, so at least one sample is always left outside the window
		for cwin in np.arange(minlen,numsamples-startpos,step):
			endpos=startpos+cwin
			meanin=np.mean(dat[:,startpos:endpos],axis=1)
			# all the samples before and after the window
			outside=np.hstack([allpos[:startpos],allpos[endpos:]])
			meanout=np.mean(dat[:,outside],axis=1)
			cdiff=meanin-meanout
			if direction=='both':
				cdiff=np.abs(cdiff)
			elif direction=='down':
				cdiff=-cdiff
			# bacteria for which this window is better than all previous windows
			improved=cdiff>maxdiff
			if posloc=='gstart':
				usepos=np.logical_and(cdiff>=minfold,maxpos==-1)
				maxpos[usepos]=startpos
			elif posloc=='start':
				maxpos[improved]=startpos
			else:
				maxpos[improved]=startpos+int(cwin/2)
			maxlen[improved]=cwin
			maxdiff=np.maximum(maxdiff,cdiff)

	keep=np.where(maxdiff>=minfold)[0]
	keeppos=maxpos[keep]
	si=np.argsort(keeppos)
	keep=keep[si]
	for ci in keep:
		hs.Debug(6,'bacteria %s startpos %d len %d diff %f' % (newexp.tax[ci],maxpos[ci],maxlen[ci],maxdiff[ci]))
	newexp=hs.reorderbacteria(newexp,keep)
	newexp.filters.append('Filter wave field=%s minlen=%d' % (field,minlen))
	hs.addcommand(newexp,"filterwave",params=params,replaceparams={'expdat':expdat})
	hs.Debug(6,'%d bacteria before filtering, %d after' % (len(expdat.seqs),len(newexp.seqs)))
	return newexp
Exemple #26
0
def joinexperiments(exp1,exp2,missingval='NA',origfieldname='origexp'):
	"""
	join 2 experiments into a new experiment. adding a new field origfieldname
	input:
	exp1,exp2 - the experiments to join
	missingval - string to put when field not in mapping file of one of the experiments
	origfieldname - name of the new field to add which contains the original experiment name
		(if an experiment already has this field, its per-sample values are kept)

	output:
	newexp - the joined experiment. Sequences are ordered as in exp1 followed by the
		sequences present only in exp2; samples are the exp1 samples followed by the exp2 samples
	"""
	params=locals()

	# samples are not renamed here, so a shared sampleid means the exp2 metadata
	# replaces the exp1 metadata for that sample - at least report it
	if set(exp1.samples) & set(exp2.samples):
		hs.Debug(9,'same sampleID in both experiments - mapping data of exp1 samples will be overwritten')

	# join the sequences of both experiments
	# ASSUMING SAME SEQ LENGTH!!!!
	# keep first-seen order (exp1 then exp2) so the result does not depend on set ordering
	allseqs=[]
	alldict={}
	for cseq in list(exp1.seqs)+list(exp2.seqs):
		if cseq not in alldict:
			alldict[cseq]=len(allseqs)
			allseqs.append(cseq)

	# make the new joined data for each experiment (zeros where the sequence is absent)
	# taxonomy is taken from exp1 when the sequence is there, otherwise from exp2
	alltax=[]
	dat1=np.zeros((len(allseqs),np.size(exp1.data,1)))
	dat2=np.zeros((len(allseqs),np.size(exp2.data,1)))
	for idx,cseq in enumerate(allseqs):
		if cseq in exp1.seqdict:
			pos1=exp1.seqdict[cseq]
			dat1[idx,:]=exp1.data[pos1,:]
			alltax.append(exp1.tax[pos1])
		else:
			alltax.append(exp2.tax[exp2.seqdict[cseq]])
		if cseq in exp2.seqdict:
			dat2[idx,:]=exp2.data[exp2.seqdict[cseq],:]

	newexp=hs.copyexp(exp1)
	# concatenate the reads
	newexp.data=np.concatenate((dat1,dat2), axis=1)
	newexp.seqdict=alldict
	newexp.seqs=allseqs
	newexp.tax=alltax
	# the joined experiment uses the sequences themselves as the ids
	newexp.sids=newexp.seqs
	newexp.samples = list(exp1.samples) + list(exp2.samples)
	newexp.origreads=exp1.origreads+exp2.origreads

	# union of the fields, keeping first-seen order (exp1 fields first)
	fields1=set(exp1.fields)
	fields2=set(exp2.fields)
	allfields=[]
	seenfields=set()
	for cfield in list(exp1.fields)+list(exp2.fields):
		if cfield not in seenfields:
			seenfields.add(cfield)
			allfields.append(cfield)
	newexp.fields=allfields

	# fields that exist only in exp2 get missingval for the exp1 samples
	for cfield in newexp.fields:
		if cfield in fields1:
			continue
		for csamp in exp1.samples:
			newexp.smap[csamp][cfield]=missingval

	# add the exp2 samples to the mapping, with missingval for fields only in exp1
	for csamp in exp2.samples:
		newexp.smap[csamp]={}
		for cfield in newexp.fields:
			if cfield in fields2:
				newexp.smap[csamp][cfield]=exp2.smap[csamp][cfield]
			else:
				newexp.smap[csamp][cfield]=missingval

	# store the experiment of origin, keeping a previous value if the experiment
	# already carries the field (e.g. it is itself the result of a join)
	hasorig1=origfieldname in fields1
	for csamp in exp1.samples:
		if hasorig1:
			cname=exp1.smap[csamp][origfieldname]
		else:
			cname=exp1.studyname
		newexp.smap[csamp][origfieldname]=cname
	hasorig2=origfieldname in fields2
	for csamp in exp2.samples:
		if hasorig2:
			cname=exp2.smap[csamp][origfieldname]
		else:
			cname=exp2.studyname
		newexp.smap[csamp][origfieldname]=cname
	# do not list the field twice when one of the experiments already had it
	if origfieldname not in newexp.fields:
		newexp.fields.append(origfieldname)

	newexp.filters.append('joined with %s' % exp2.studyname)
	hs.addcommand(newexp,"joinexperiments",params=params,replaceparams={'exp1':exp1,'exp2':exp2})
	return newexp