Esempio n. 1
0
def load(tablename, mapname='map.txt', taxfile='', nameisseq=True,studyname=False,tabletype='biom',normalize=True,addsname='',keepzero=False,removefrom=False,removenum=1,mapsampletolowercase=False,sortit=True,useseqnamefortax=True,rawreads=False,usesparse=False):
	"""
	Load an experiment - a data table and (optionally) a mapping file

	The table is loaded, its sample ids are optionally trimmed/prefixed, samples are
	matched against the mapping file, and the result is stored in an hs.Experiment
	(optionally normalized and sorted).

	input:
	tablename - the name of the biom table file
	mapname - name of the mapping file (empty to not load a mapping file; in that case
		each table sample gets only a '#SampleID' field)
	taxfile - empty ('') to load taxonomy from biom table, non-empty to load
		from rdp output file (web)
		NOTE(review): taxfile is not used anywhere in this function body
	nameisseq - False to keep otu name as sid without hashing it, True to treat otuid as sequence
		(the sid is then hs.hashseq of the otu id)
	studyname - False to assign from table file name, otherwise string to store as study name
	tabletype:
		'biom' - a biom table
		'meta' - a metabolomics table (row per sample, col per metabolite, can contain duplicate metaboliteids)
		any other value is reported via hs.Debug and the function returns False
	normalize - True to normalize to 10k reads per sample, False to not normalize (change to mean 10k reads/sample)
		(forced to False when tabletype is 'meta')
	addsname - a string to add (as a prefix) to each table sample name (or empty to not add)
	keepzero : bool
		True to keep mapping file samples that are not in the table (added as all-zero columns)
		and samples with 0 reads, False (default) to throw them away
	removefrom : string
		if non empty - cut table sample name after (and including) the removenum-th occurrence
		of removefrom. If several samples get the same trimmed name, later ones are renamed
		to '<name>-rep-<count>'
	removenum : int
		which occurrence of removefrom to cut at (passed to hs.findn). default=1
	mapsampletolowercase : bool
		True to convert the mapping file sample id to lower case (for EMP data). default=False
	sortit : bool
		True (default) to sort sequences by taxonomy, False to not sort
	useseqnamefortax : bool
		True (default) to use the sequence as taxonomy if no taxonomy supplied, False to use 'unknown'
	rawreads : bool
		True in combination with normalize=False - do not modify read count to mean 10k
	usesparse : bool
		True to use sparse matrix representation, False to use non-sparse (default)

	output:
	an experiment class for the current experiment, or False if tabletype is not supported
	"""

	# must stay the first statement so that only the call parameters are captured
	params=locals()

	# load the table
	if tabletype=='biom':
		hs.Debug(6,'Loading biom table')
		table=biom.load_table(tablename)
	elif tabletype=='meta':
		hs.Debug(6,'Loading metabolite table')
		table=loadmetabuckettable(tablename)
	else:
		hs.Debug(9,'Table type %s not supported' % tabletype)
		return False

	# md5 of the raw data matrix bytes (calculated before any sample filtering)
	datamd5g=hashlib.md5()
	datamd5g.update(table.matrix_data.todense().A.view(np.uint8))
	datamd5=datamd5g.hexdigest()
	print(datamd5)

	# if need to cut table sample names
	if removefrom:
		idtable={}
		# number of times each trimmed id was encountered (to rename duplicates)
		foundids={}
		ids=table.ids(axis='sample')
		if len(set(ids))!=len(ids):
			hs.Debug(8,'non unique ids identified')
		for cid in ids:
			if removefrom in cid:
				fpos=hs.findn(cid,removefrom,removenum)
				if fpos==-1:
					hs.Debug(6,'Did not find enough %s in %s' % (removefrom,cid))
					tid=cid
				else:
					tid=cid[:fpos]
			else:
				hs.Debug(6,'%s not found in sample name %s (removefrom)' % (removefrom,cid))
				tid=cid
			if tid in foundids:
				# trimmed name collides with a previous sample - make it unique
				hs.Debug(6,'already have id %s' % cid)
				foundids[tid]+=1
				idtable[cid]=tid+'-rep-'+str(foundids[tid])
				print(idtable[cid])
			else:
				foundids[tid]=1
				idtable[cid]=tid
		hs.Debug(6,'found %d keys %d values' % (len(set(idtable.keys())),len(set(idtable.values()))))
		table=table.update_ids(idtable,axis='sample')

	# if need to add constant string to sample names in table
	if addsname!='':
		idtable={cid:addsname+cid for cid in table.ids(axis='sample')}
		table=table.update_ids(idtable,axis='sample')

	smap = {}
	mapsamples = []
	mapmd5=''
	if mapname:
		# if mapping file supplied, load it
		mapsamples,smap,fields,mapmd5=loadmap(mapname,mapsampletolowercase=mapsampletolowercase)
	else:
		# no mapping file, so just create the #SampleID field
		hs.Debug(6,'No mapping file supplied - using just sample names')
		tablesamples = table.ids(axis='sample')
		for cid in tablesamples:
			smap[cid]={'#SampleID':cid}
			mapsamples.append(cid)
		fields=['#SampleID']
		mapmd5=''

	# remove table samples not in mapping file
	tablesamples = table.ids(axis='sample')
	hs.Debug(6,'number of samples in table is %d' % len(tablesamples))
	# build the lookup set once instead of scanning the list for every table sample
	mapsampleset=set(mapsamples)
	removelist=[]
	for cid in tablesamples:
		if cid not in mapsampleset:
			removelist.append(cid)
			hs.Debug(6,'Table sample %s not found in mapping file' % cid)
	hs.Debug(6,'removing %s samples' % len(removelist))
	if removelist:
		table=table.filter(removelist,axis='sample',invert=True)

	tablesamples = table.ids(axis='sample')
	hs.Debug(6,'deleted. number of samples in table is now %d' % len(tablesamples))

	# remove samples not in table from mapping file
	# (or, if keepzero, remember them so they can be added as all-zero samples)
	tablesampleset=set(tablesamples)
	removemap=[]
	addlist=[]
	for idx,cmap in enumerate(mapsamples):
		if cmap not in tablesampleset:
			hs.Debug(2,'Map sample %s not in table' % cmap)
			if not keepzero:
				removemap.append(idx)
				try:
					del smap[cmap]
				except KeyError:
					# the id appears more than once in mapsamples and was already deleted
					hs.Debug(8,'Duplicate SampleID %s in mapping file' % cmap)
			else:
				addlist.append(cmap)
	if removemap:
		hs.Debug(7,'removing %d samples from mapping file' % len(removemap))
		mapsamples=hs.delete(mapsamples,removemap)
	hs.Debug(7,'number of samples in mapping file is now %d' % len(mapsamples))

	# get info about the sequences
	tableseqs = table.ids(axis='observation')
	sids = []
	tax = []
	osnames=[]
	for cid in tableseqs:
		# get the original observation (otu) name
		osnames.append(cid)
		# get the sid (hash of the sequence, or the otu id itself)
		if nameisseq:
			sids.append(hs.hashseq(cid))
		else:
			sids.append(cid)
		# get the taxonomy string
		ctax=gettaxfromtable(table,cid,useseqname=useseqnamefortax)
		tax.append(ctax)

	if not studyname:
		studyname=os.path.basename(tablename)

	exp=hs.Experiment()
	exp.datatype=tabletype
	if usesparse:
		# NOTE(review): the np.hstack / np.sum(...,keepdims=...) calls below are written for
		# dense arrays - confirm they behave as intended with a sparse matrix
		exp.data=scipy.sparse.dok_matrix(table.matrix_data)
	else:
		exp.data=table.matrix_data.todense().A
	# check if need to add the 0 read samples to the data
	if addlist:
		tablesamples=list(tablesamples)
		tablesamples=tablesamples+addlist
		# one all-zero column per mapping file sample which is missing from the table
		exp.data=np.hstack([exp.data,np.zeros([np.shape(exp.data)[0],len(addlist)])])
	exp.smap=smap
	exp.samples=tablesamples
	exp.seqs=tableseqs
	for idx,cseq in enumerate(exp.seqs):
		exp.seqdict[cseq]=idx
	exp.sids=sids
	exp.origotunames=osnames
	exp.tax=tax
	exp.tablefilename=tablename
	exp.studyname=studyname
	# store the mapping file name (was mistakenly set to the table file name)
	exp.mapfilename=mapname
	exp.filters = [tablename]
	exp.fields = fields
	exp.datamd5 = datamd5
	exp.mapmd5 = mapmd5
	colsum=np.sum(exp.data,axis=0,keepdims=False)
	exp.origreads=list(colsum)
	# add the original number of reads as a field to the experiment
	exp.fields.append('origReads')
	for idx,csamp in enumerate(exp.samples):
		exp.smap[csamp]['origReads']=str(exp.origreads[idx])

	# find (and optionally remove) samples with 0 reads
	colsum=np.sum(exp.data,axis=0,keepdims=True)
	okreads=np.where(colsum>0)
	numzero=np.size(colsum)-np.size(okreads[1])
	if numzero>0:
		print("Samples with 0 reads: %d" % numzero)
		if not keepzero:
			exp=hs.reordersamples(exp,okreads[1])
		colsum=np.sum(exp.data,axis=0,keepdims=True)
	if tabletype=='meta':
		normalize=False

	if normalize:
		# normalize samples to 10k reads per sample.
		# samples with 0 reads (kept when keepzero=True) are divided by 1 instead of 0,
		# so they stay all-zero instead of turning into NaN
		exp.data=10000*exp.data/np.where(colsum==0,1,colsum)
	else:
		if not rawreads:
			# scale all samples by the same factor so the mean is 10k reads per sample
			exp.data=10000*exp.data/np.mean(colsum)

	exp.uniqueid=exp.getexperimentid()
	if sortit:
		exp=hs.sortbacteria(exp,logit=False)
	hs.addcommand(exp,"load",params=params)
	exp.filters.append('loaded table=%s, map=%s' % (tablename,mapname))
	return(exp)