def test_subsample_without_replacement(self):
    """Should return a random subsample (without replacement)."""
    # Drawing 2 counts from [2, 0, 1] without replacement can only
    # produce (1, 0, 1) or (2, 0, 0); 1000 draws should hit both.
    counts = np.array([2, 0, 1])
    seen = set()
    for _ in range(1000):
        seen.add(tuple(subsample(counts, 2)))
    self.assertEqual(seen, {(1, 0, 1), (2, 0, 0)})

    # A single draw must be one of the two possible outcomes.
    drawn = subsample(counts, 2)
    self.assertTrue(np.array_equal(drawn, np.array([1, 0, 1])) or
                    np.array_equal(drawn, np.array([2, 0, 0])))
def test_subsample_without_replacement(self):
    """Should return a random subsample (without replacement)."""
    # Only two subsamples of size 2 exist for [2, 0, 1]; after 1000
    # random draws we expect to have observed each at least once.
    counts = np.array([2, 0, 1])
    observed = {tuple(subsample(counts, 2)) for _ in range(1000)}
    self.assertEqual(observed, {(1, 0, 1), (2, 0, 0)})

    # Any single result is constrained to one of those two vectors.
    result = subsample(counts, 2)
    is_valid = (np.array_equal(result, np.array([1, 0, 1])) or
                np.array_equal(result, np.array([2, 0, 0])))
    self.assertTrue(is_valid)
def test_subsample_invalid_input(self):
    """Should raise an error on invalid input."""
    # Each entry pairs bad call arguments with the expected exception.
    invalid_calls = [
        (([1, 2, 3], -1), ValueError),               # negative n
        (([1, 2.3, 3], 2), TypeError),               # float counts
        (([[1, 2, 3], [4, 5, 6]], 2), ValueError),   # wrong dimensionality
        (([0, 5, 0], 6), ValueError),                # too few counts for n
    ]
    for args, expected_error in invalid_calls:
        with self.assertRaises(expected_error):
            subsample(*args)
def test_subsample_with_replacement_equal_n(self):
    """Returns random subsample (w/ replacement) when n == counts.sum()."""
    counts = np.array([0, 0, 3, 4, 2, 1])
    distinct_results = set()
    for _ in range(1000):
        drawn = subsample(counts, 10, replace=True)
        # Total drawn must always match the requested sample size.
        self.assertEqual(drawn.sum(), 10)
        distinct_results.add(tuple(drawn))
    # Sampling with replacement should not be deterministic here, so
    # repeated draws must produce more than one distinct result.
    self.assertTrue(len(distinct_results) > 1)
def test_subsample_nonrandom(self):
    """Should function correctly for nonrandom cases."""
    counts = np.array([0, 5, 0])

    # Taking every item without replacement returns the input unchanged.
    np.testing.assert_equal(subsample(counts, 5), counts)

    # Only one nonzero bin, so the result is fully determined.
    expected = np.array([0, 2, 0])
    np.testing.assert_equal(subsample(counts, 2), expected)
    np.testing.assert_equal(subsample(counts, 2, replace=True), expected)

    # Requesting zero items yields an all-zero vector.
    counts = [3, 0, 1]
    expected = np.array([0, 0, 0])
    np.testing.assert_equal(subsample(counts, 0), expected)
    np.testing.assert_equal(subsample(counts, 0, replace=True), expected)
def michaelis_menten_fit(counts, num_repeats=1, params_guess=None,
                         return_b=False):
    """Michaelis-Menten fit to rarefaction curve of observed species.

    Note: there is some controversy about how to do the fitting. The ML model
    given by Raaijmakers 1987 is based on the assumption that error is roughly
    proportional to magnitude of observation, reasonable for enzyme kinetics
    but not reasonable for rarefaction data. Here we just do a nonlinear curve
    fit for the parameters using least-squares.

    S = Smax*n/(B + n) . n: number of individuals, S: # of species
    returns Smax

    inputs:
    num_repeats: will perform rarefaction (subsampling without replacement)
    this many times at each value of n
    params_guess: initial guess of Smax, B (None => default)
    return_b: if True will return the estimate for Smax, B. Default is just
    Smax

    the fit is made to datapoints where n = 1,2,...counts.sum(),
    S = species represented in random sample of n individuals
    """
    counts = asarray(counts)
    if params_guess is None:
        params_guess = array([100, 500])

    # Observed # of species vs # of individuals sampled (S vs n), averaged
    # over num_repeats independent rarefactions at each depth.
    xvals = arange(1, counts.sum() + 1)
    ymtx = []
    for _ in range(num_repeats):
        ymtx.append(
            array([observed_species(subsample(counts, n)) for n in xvals]))
    ymtx = asarray(ymtx)
    yvals = ymtx.mean(0)

    # Fit to obs_sp = max_sp * num_indiv / (num_indiv + B); return max_sp.
    def fitfn(p, n):  # works with vectors of n, returns vector of S
        return p[0] * n / (p[1] + n)

    def errfn(p, n, y):  # vectors of actual vals y and number of individuals n
        return ((fitfn(p, n) - y) ** 2).sum()

    p1 = fmin_powell(errfn, params_guess, args=(xvals, yvals), disp=0)
    if return_b:
        return p1
    else:
        return p1[0]  # return only S_max, not the K_m (B) param
def test_subsample_with_replacement(self):
    """Should return a random subsample (with replacement)."""
    # With replacement, 2 items drawn from [2, 0, 1] may all come from
    # the first bin, all from the last, or be split across both.
    counts = np.array([2, 0, 1])
    observed = set()
    for _ in range(1000):
        observed.add(tuple(subsample(counts, 2, replace=True)))
    self.assertEqual(observed, {(1, 0, 1), (2, 0, 0), (0, 0, 2)})

    # Test that selecting 35 counts from a 36-count vector 1000 times
    # yields more than 10 different subsamples. If we were subsampling
    # *without* replacement, there would be only 10 possible subsamples
    # because there are 10 nonzero bins in the vector. However, there are
    # more than 10 possibilities when sampling *with* replacement.
    counts = np.array([2, 0, 1, 2, 1, 8, 6, 0, 3, 3, 5, 0, 0, 0, 5])
    observed = set()
    for _ in range(1000):
        drawn = subsample(counts, 35, replace=True)
        self.assertEqual(drawn.sum(), 35)
        observed.add(tuple(drawn))
    self.assertTrue(len(observed) > 10)
def michaelis_menten_fit(counts, num_repeats=1, params_guess=None,
                         return_b=False):
    """Michaelis-Menten fit to rarefaction curve of observed species.

    Note: there is some controversy about how to do the fitting. The ML model
    given by Raaijmakers 1987 is based on the assumption that error is roughly
    proportional to magnitude of observation, reasonable for enzyme kinetics
    but not reasonable for rarefaction data. Here we just do a nonlinear curve
    fit for the parameters using least-squares.

    S = Smax*n/(B + n) . n: number of individuals, S: # of species
    returns Smax

    inputs:
    num_repeats: will perform rarefaction (subsampling without replacement)
    this many times at each value of n
    params_guess: initial guess of Smax, B (None => default)
    return_b: if True will return the estimate for Smax, B. Default is just
    Smax

    the fit is made to datapoints where n = 1,2,...counts.sum(),
    S = species represented in random sample of n individuals
    """
    counts = asarray(counts)
    if params_guess is None:
        params_guess = array([100, 500])

    # Observed # of species vs # of individuals sampled (S vs n), averaged
    # over num_repeats independent rarefactions at each depth.
    xvals = arange(1, counts.sum() + 1)
    ymtx = []
    for _ in range(num_repeats):
        ymtx.append(array([observed_species(subsample(counts, n))
                           for n in xvals]))
    ymtx = asarray(ymtx)
    yvals = ymtx.mean(0)

    # Fit to obs_sp = max_sp * num_indiv / (num_indiv + B); return max_sp.
    def fitfn(p, n):  # works with vectors of n, returns vector of S
        return p[0] * n / (p[1] + n)

    def errfn(p, n, y):  # vectors of actual vals y and number of individuals n
        return ((fitfn(p, n) - y) ** 2).sum()

    p1 = fmin_powell(errfn, params_guess, args=(xvals, yvals), disp=0)
    if return_b:
        return p1
    else:
        return p1[0]  # return only S_max, not the K_m (B) param