def test_subsample(self):
    """subsample should return a random subsample of a vector"""
    # A vector with a single nonzero entry can only be subsampled one
    # way, so the result is fully determined.
    counts = array([0, 5, 0])
    self.assertEqual(subsample(counts, 5), array([0, 5, 0]))
    self.assertEqual(subsample(counts, 2), array([0, 2, 0]))
    # With two occupied slots the draw is random: taking 2 of the 3
    # individuals in [2, 0, 1] yields exactly one of two outcomes.
    mixed = array([2, 0, 1])
    drawn = subsample(mixed, 2)
    outcomes = (array([1, 0, 1]), array([2, 0, 0]))
    assert any((drawn == outcome).all() for outcome in outcomes)
def filter_otus_by_lineage(sample_ids, otu_ids, otu_table, lineages,
                           wanted_lineage, max_seqs_per_sample,
                           min_seqs_per_sample):
    """Filter OTU table to keep only desired lineages and sample sizes.

    Parameters:
      sample_ids: list of sample identifiers (one per otu_table column)
      otu_ids: list of OTU identifiers (one per otu_table row)
      otu_table: numpy array, otus (rows) by samples (cols)
      lineages: per-OTU lineage assignments, parallel to otu_ids
      wanted_lineage: lineage string to keep; multiple lineages may be
        joined with '&&'; None keeps all lineages
      max_seqs_per_sample: samples with more sequences are randomly
        subsampled down to this many
      min_seqs_per_sample: samples with fewer sequences are dropped

    Returns (sample_ids, otu_ids, otu_table, lineages) filtered
    accordingly.
    """
    # Step 1: keep only OTUs whose lineage intersects the wanted set.
    if wanted_lineage is not None:  # None = keep all lineages
        if '&&' in wanted_lineage:
            wanted_lineage = set(wanted_lineage.split('&&'))
        else:
            wanted_lineage = set([wanted_lineage])
        good_indices = [i for i, lineage in enumerate(lineages)
                        if set(lineage).intersection(wanted_lineage)]
        otu_table = otu_table[good_indices]
        # List comprehensions instead of map() so the results are real
        # lists under Python 3 as well (map() returns a lazy iterator
        # there, which would break callers expecting lists).
        otu_ids = [otu_ids[i] for i in good_indices]
        lineages = [lineages[i] for i in good_indices]
    # Step 2: drop samples whose total sequence count is too small.
    big_enough_samples = (otu_table.sum(0) >= min_seqs_per_sample).nonzero()
    otu_table = otu_table[:, big_enough_samples[0]]
    sample_ids = [sample_ids[i] for i in big_enough_samples[0]]
    # Step 3: randomly subsample any sample that exceeds the maximum.
    too_big_samples = (otu_table.sum(0) > max_seqs_per_sample).nonzero()[0]
    if too_big_samples.shape[0]:  # means that there were some
        for i in too_big_samples:
            otu_table[:, i] = subsample(otu_table[:, i].ravel(),
                                        max_seqs_per_sample)
    return sample_ids, otu_ids, otu_table, lineages
def get_rare_data(sample_ids, otu_table, seqs_per_sample,
                  include_small_samples=False):
    """Filter OTU table to keep only desired sample sizes.

    - include_small_samples=False => do not write samples with
      < seqs_per_sample total sequences
    - otu_table (input and output) is otus (rows) by samples (cols)
    - no otus are removed, even if they are absent in the rarefied table

    Returns (sample_ids, otu_table) with over-sized samples rarefied and
    (optionally) under-sized samples dropped.  The input table is not
    modified.
    """
    res_otu_table = otu_table.copy()
    res_sample_ids = sample_ids
    # Samples with more than seqs_per_sample sequences are randomly
    # rarefied down to exactly seqs_per_sample.
    too_big_samples = (otu_table.sum(0) > seqs_per_sample).nonzero()[0]
    if too_big_samples.shape[0]:  # means that there were some
        for i in too_big_samples:
            res_otu_table[:, i] = subsample(otu_table[:, i].ravel(),
                                            seqs_per_sample)
    if not include_small_samples:
        # Drop samples whose totals are still below seqs_per_sample.
        big_enough_samples = (res_otu_table.sum(0) >=
                              seqs_per_sample).nonzero()
        res_otu_table = res_otu_table[:, big_enough_samples[0]]
        # List comprehension instead of map() so the result is a real
        # list under Python 3 as well (map() is lazy there).
        res_sample_ids = [sample_ids[i] for i in big_enough_samples[0]]
    return res_sample_ids, res_otu_table
def test_subsample(self):
    """subsample should return a random subsample of a vector"""
    # Only one occupied slot, so the subsample is deterministic.
    a = array([0, 5, 0])
    self.assertEqual(subsample(a, 5), array([0, 5, 0]))
    self.assertEqual(subsample(a, 2), array([0, 2, 0]))
    # selecting 2 counts from the vector 1000 times yields each of the
    # two possible results at least once each
    # (the original duplicated this assignment; the dead second
    # `b = array([2,0,1])` has been removed)
    b = array([2, 0, 1])
    actual = {}
    for i in range(1000):
        e = subsample(b, 2)
        actual[tuple(e)] = None
    self.assertEqual(actual, {(1, 0, 1): None, (2, 0, 0): None})
    # A single draw must also be one of the two possible outcomes.
    obs = subsample(b, 2)
    assert (obs == array([1, 0, 1])).all() or (obs == array([2, 0, 0])).all()
def michaelis_menten_fit(counts, num_repeats=1, params_guess=None,
                         return_b=False):
    """Michaelis-Menten fit to rarefaction curve of observed species.

    Fits S = Smax*n/(B + n) by nonlinear least squares, where n is the
    number of individuals sampled and S is the number of species observed.

    Note: there is some controversy about how to do the fitting.  The ML
    model given by Raaijmakers 1987 assumes error roughly proportional to
    the magnitude of the observation — reasonable for enzyme kinetics but
    not for rarefaction data — so a plain least-squares curve fit is used
    here instead.

    inputs:
      counts: vector of species counts
      num_repeats: rarefaction (subsampling without replacement) is
        performed this many times at each value of n
      params_guess: initial guess for (Smax, B); None selects a default
      return_b: if True, return the (Smax, B) estimate; default returns
        just Smax

    The fit is made to data points n = 1, 2, ..., counts.sum(), with S
    the number of species represented in a random sample of n
    individuals.
    """
    counts = asarray(counts)
    if params_guess is None:
        params_guess = array([100, 500])

    # Mean observed species vs. individuals sampled, averaged over the
    # requested number of rarefaction repeats.
    sample_sizes = arange(1, counts.sum() + 1)
    repeats = [
        array([observed_species(rarefaction.subsample(counts, n))
               for n in sample_sizes])
        for _ in range(num_repeats)
    ]
    mean_species = asarray(repeats).mean(0)

    def model(params, n):
        # Michaelis-Menten curve; works elementwise on vectors of n.
        return params[0] * n / (params[1] + n)

    def squared_error(params, n, observed):
        # Sum of squared residuals between the model and the data.
        return ((model(params, n) - observed) ** 2).sum()

    fitted = fmin_powell(squared_error, params_guess,
                         args=(sample_sizes, mean_species), disp=0)
    if return_b:
        return fitted
    return fitted[0]  # return only S_max, not the K_m (B) param
def michaelis_menten_fit(counts, num_repeats=1, params_guess=None,
                         return_b=False):
    """Michaelis-Menten fit to rarefaction curve of observed species.

    The model is S = Smax*n/(B + n), with n the number of individuals
    sampled and S the number of species observed; returns Smax.

    Note: there is some controversy about how to do the fitting.  The ML
    model given by Raaijmakers 1987 assumes error roughly proportional to
    the magnitude of the observation, reasonable for enzyme kinetics but
    not for rarefaction data; here a nonlinear least-squares curve fit is
    used for the parameters instead.

    inputs:
      counts: vector of species counts
      num_repeats: rarefaction (subsampling without replacement) is run
        this many times at each value of n
      params_guess: initial guess of (Smax, B); None selects a default
      return_b: if True, return the (Smax, B) estimate; default is Smax

    The fit is made to data points where n = 1, 2, ..., counts.sum() and
    S is the species represented in a random sample of n individuals.
    """
    counts = asarray(counts)
    guess = array([100, 500]) if params_guess is None else params_guess

    # Observed number of species vs. number of individuals sampled,
    # averaged over num_repeats rarefaction runs.
    xvals = arange(1, counts.sum() + 1)
    curves = []
    for _ in range(num_repeats):
        curve = [observed_species(rarefaction.subsample(counts, n))
                 for n in xvals]
        curves.append(array(curve))
    yvals = asarray(curves).mean(0)

    def residual_ss(p, n, y):
        # Sum of squared residuals of S = p[0]*n/(p[1] + n) against the
        # observed values y; works on vectors of n.
        predicted = p[0] * n / (p[1] + n)
        return ((predicted - y) ** 2).sum()

    best = fmin_powell(residual_ss, guess, args=(xvals, yvals), disp=0)
    # Return only S_max unless the caller also wants the K_m (B) param.
    return best if return_b else best[0]