Python subsampleの例、skbio.maths.subsample.subsample Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_subsample.py プロジェクト: Jorge-C/bipy

    def test_subsample_without_replacement(self):
        """Subsampling without replacement yields only the valid outcomes."""
        counts = np.array([2, 0, 1])
        outcomes = {(1, 0, 1), (2, 0, 0)}

        # Drawing 2 counts from the vector 1000 times is expected to produce
        # each of the two possible results at least once.
        seen = {tuple(subsample(counts, 2)) for _ in range(1000)}
        self.assertEqual(seen, outcomes)

        # A single draw must be one of the two possible results.
        single = subsample(counts, 2)
        self.assertTrue(np.array_equal(single, np.array([1, 0, 1])) or
                        np.array_equal(single, np.array([2, 0, 0])))

コード例 #2

0

ファイルを表示

    def test_subsample_without_replacement(self):
        """Check that random subsampling without replacement behaves."""
        vector = np.array([2, 0, 1])

        # Over 1000 draws of 2 counts, both possible results should show up
        # and nothing else should.
        distinct = set()
        for _ in range(1000):
            drawn = subsample(vector, 2)
            distinct.add(tuple(drawn))
        self.assertEqual(distinct, {(1, 0, 1), (2, 0, 0)})

        # One more draw: it has to equal one of the two possible results.
        drawn = subsample(vector, 2)
        split_across_bins = np.array_equal(drawn, np.array([1, 0, 1]))
        self.assertTrue(
            split_across_bins
            or np.array_equal(drawn, np.array([2, 0, 0])))

コード例 #3

0

ファイルを表示

    def test_subsample_invalid_input(self):
        """Invalid arguments to subsample must raise the expected errors."""
        # A negative number of items to draw.
        self.assertRaises(ValueError, subsample, [1, 2, 3], -1)

        # Counts containing a float.
        self.assertRaises(TypeError, subsample, [1, 2.3, 3], 2)

        # Counts with two dimensions instead of one.
        self.assertRaises(ValueError, subsample, [[1, 2, 3], [4, 5, 6]], 2)

        # Asking for more items than the counts contain.
        self.assertRaises(ValueError, subsample, [0, 5, 0], 6)

コード例 #4

0

ファイルを表示

ファイル: test_subsample.py プロジェクト: Jorge-C/bipy

    def test_subsample_invalid_input(self):
        """Each kind of invalid input should trigger the matching error."""
        # (expected exception, counts, n), checked in this order.
        bad_calls = (
            (ValueError, [1, 2, 3], -1),                # negative n
            (TypeError, [1, 2.3, 3], 2),                # float in counts
            (ValueError, [[1, 2, 3], [4, 5, 6]], 2),    # wrong dimensionality
            (ValueError, [0, 5, 0], 6),                 # too few counts
        )
        for expected_error, counts, n in bad_calls:
            with self.assertRaises(expected_error):
                subsample(counts, n)

コード例 #5

0

ファイルを表示

 def test_subsample_with_replacement_equal_n(self):
     """Sampling with replacement stays random when n == counts.sum()."""
     counts = np.array([0, 0, 3, 4, 2, 1])
     distinct = set()
     for _ in range(1000):
         drawn = subsample(counts, 10, replace=True)
         # Every draw must contain exactly the requested number of items.
         self.assertEqual(drawn.sum(), 10)
         distinct.add(tuple(drawn))
     # More than one distinct result shows the output is not simply fixed.
     self.assertTrue(len(distinct) > 1)

コード例 #6

0

ファイルを表示

    def test_subsample_nonrandom(self):
        """Cases with a single possible outcome must return that outcome."""
        single_bin = np.array([0, 5, 0])

        # Drawing as many items as the input holds (without replacement)
        # gives back the input itself.
        np.testing.assert_equal(subsample(single_bin, 5), single_bin)

        # Only one bin is populated, so every draw comes from it.
        two_from_middle = np.array([0, 2, 0])
        np.testing.assert_equal(subsample(single_bin, 2), two_from_middle)
        np.testing.assert_equal(subsample(single_bin, 2, replace=True),
                                two_from_middle)

        # Drawing zero items gives all-zero counts.
        counts_list = [3, 0, 1]
        all_zero = np.array([0, 0, 0])
        np.testing.assert_equal(subsample(counts_list, 0), all_zero)
        np.testing.assert_equal(subsample(counts_list, 0, replace=True),
                                all_zero)

コード例 #7

0

ファイルを表示

ファイル: test_subsample.py プロジェクト: Jorge-C/bipy

 def test_subsample_with_replacement_equal_n(self):
     """With replacement and n == counts.sum(), results should still vary."""
     vector = np.array([0, 0, 3, 4, 2, 1])
     results = set()
     for _ in range(1000):
         sample = subsample(vector, 10, replace=True)
         # The subsample total has to match the requested size.
         self.assertEqual(sample.sum(), 10)
         results.add(tuple(sample))
     # At least two different subsamples are expected across the draws.
     self.assertTrue(len(results) > 1)

コード例 #8

0

ファイルを表示

ファイル: test_subsample.py プロジェクト: Jorge-C/bipy

    def test_subsample_nonrandom(self):
        """Check subsample on inputs whose result is fully determined."""
        counts = np.array([0, 5, 0])

        # Requesting every item in the input (without replacement) returns
        # the input counts.
        full_draw = subsample(counts, 5)
        np.testing.assert_equal(full_draw, counts)

        # With one populated bin there is nothing else to choose from.
        expected = np.array([0, 2, 0])
        without_replacement = subsample(counts, 2)
        np.testing.assert_equal(without_replacement, expected)
        with_replacement = subsample(counts, 2, replace=True)
        np.testing.assert_equal(with_replacement, expected)

        # Requesting zero items returns zeros in every bin.
        counts = [3, 0, 1]
        expected = np.array([0, 0, 0])
        without_replacement = subsample(counts, 0)
        np.testing.assert_equal(without_replacement, expected)
        with_replacement = subsample(counts, 0, replace=True)
        np.testing.assert_equal(with_replacement, expected)

コード例 #9

0

ファイルを表示

def michaelis_menten_fit(counts,
                         num_repeats=1,
                         params_guess=None,
                         return_b=False):
    """Fit a Michaelis-Menten curve to the observed-species rarefaction curve.

    The model is S = Smax*n/(B + n), where n is the number of individuals
    sampled and S is the number of species observed. The fit uses the
    datapoints n = 1, 2, ..., counts.sum(), with S taken as the species
    represented in a random sample of n individuals, averaged over the
    repeats.

    Note: how to do the fitting is somewhat controversial. The ML model
    given by Raaijmakers 1987 assumes the error is roughly proportional to
    the magnitude of the observation, which is reasonable for enzyme kinetics
    but not for rarefaction data. Here the parameters are simply found by a
    nonlinear least-squares curve fit.

    inputs:
    counts: vector of counts, converted with asarray
    num_repeats: number of rarefactions (subsampling without replacement)
    performed at each value of n
    params_guess: initial guess of Smax, B (None => [100, 500])
    return_b: if True, return the estimates for both Smax and B; by default
    only Smax is returned

    """
    counts = asarray(counts)
    start = array([100, 500]) if params_guess is None else params_guess

    # One row per repeat: observed species S at each sample size n.
    sample_sizes = arange(1, counts.sum() + 1)
    species_matrix = asarray([
        array([observed_species(subsample(counts, n)) for n in sample_sizes])
        for _ in range(num_repeats)
    ])
    mean_species = species_matrix.mean(0)

    def squared_error(params, n, observed):
        # Sum of squared residuals of Smax * n / (B + n) against observed,
        # with params[0] = Smax and params[1] = B; n and observed are vectors.
        predicted = params[0] * n / (params[1] + n)
        return ((predicted - observed)**2).sum()

    fitted = fmin_powell(
        squared_error, start, args=(sample_sizes, mean_species), disp=0)
    # Without return_b only S_max is handed back, not the K_m (B) parameter.
    return fitted if return_b else fitted[0]

コード例 #10

0

ファイルを表示

    def test_subsample_with_replacement(self):
        """Subsampling with replacement can produce every valid outcome."""
        # With replacement, both items may come from the first bin, both from
        # the last bin, or one from each.
        small = np.array([2, 0, 1])
        seen = {tuple(subsample(small, 2, replace=True)) for _ in range(1000)}
        self.assertEqual(seen, {(1, 0, 1), (2, 0, 0), (0, 0, 2)})

        # Drawing 35 counts from a 36-count vector *without* replacement could
        # only give 10 distinct subsamples, one per nonzero bin. *With*
        # replacement there are more possibilities, so 1000 draws should
        # produce more than 10 distinct subsamples.
        large = np.array([2, 0, 1, 2, 1, 8, 6, 0, 3, 3, 5, 0, 0, 0, 5])
        distinct = set()
        for _ in range(1000):
            drawn = subsample(large, 35, replace=True)
            self.assertEqual(drawn.sum(), 35)
            distinct.add(tuple(drawn))
        self.assertTrue(len(distinct) > 10)

コード例 #11

0

ファイルを表示

ファイル: test_subsample.py プロジェクト: Jorge-C/bipy

    def test_subsample_with_replacement(self):
        """Check that random subsampling with replacement behaves."""
        vector = np.array([2, 0, 1])
        # Because items are replaced, the draw may be all first bin, all last
        # bin, or split between the two.
        expected_outcomes = {(1, 0, 1), (2, 0, 0), (0, 0, 2)}
        outcomes = set()
        for _ in range(1000):
            sample = subsample(vector, 2, replace=True)
            outcomes.add(tuple(sample))
        self.assertEqual(outcomes, expected_outcomes)

        # The next vector holds 36 counts in 10 nonzero bins. Taking 35
        # *without* replacement would allow just 10 different subsamples;
        # taking 35 *with* replacement allows more, so 1000 draws are
        # expected to give more than 10 different subsamples.
        vector = np.array([2, 0, 1, 2, 1, 8, 6, 0, 3, 3, 5, 0, 0, 0, 5])
        outcomes = set()
        for _ in range(1000):
            sample = subsample(vector, 35, replace=True)
            # Each subsample has to total the requested 35 counts.
            self.assertEqual(sample.sum(), 35)
            outcomes.add(tuple(sample))
        self.assertTrue(len(outcomes) > 10)

コード例 #12

0

ファイルを表示

ファイル: alpha_diversity.py プロジェクト: passdan/qiime

def michaelis_menten_fit(counts, num_repeats=1, params_guess=None,
                         return_b=False):
    """Estimate Smax from a Michaelis-Menten fit to the rarefaction curve.

    Model: S = Smax*n/(B + n), with n the number of individuals and S the
    number of species. Returns Smax, or the estimates for both Smax and B
    when return_b is True.

    Note: there is some controversy about how the fit should be done. The ML
    model given by Raaijmakers 1987 rests on the assumption that error is
    roughly proportional to the magnitude of the observation; that is
    reasonable for enzyme kinetics but not for rarefaction data. This
    function just does a nonlinear least-squares curve fit for the
    parameters.

    inputs:
    counts: vector of counts, passed through asarray
    num_repeats: rarefaction (subsampling without replacement) is performed
    this many times at each value of n
    params_guess: initial guess of Smax, B (None => [100, 500])
    return_b: if True, return the estimates for Smax and B instead of Smax
    alone

    The fit is made to datapoints where n = 1, 2, ..., counts.sum() and
    S = species represented in a random sample of n individuals, averaged
    over the repeats.

    """
    counts = asarray(counts)
    if params_guess is None:
        params_guess = array([100, 500])

    # Build the S-vs-n curve: one row of observed species per repeat.
    individuals = arange(1, counts.sum() + 1)
    rows = []
    for _ in range(num_repeats):
        species = [observed_species(subsample(counts, size))
                   for size in individuals]
        rows.append(array(species))
    mean_curve = asarray(rows).mean(0)

    def model(params, n):
        # Predicted species for a vector n: params[0] is max_sp, params[1] B.
        return params[0] * n / (params[1] + n)

    def residual_sum(params, n, observed):
        # Least-squares objective over the vectors n and observed.
        residuals = model(params, n) - observed
        return (residuals ** 2).sum()

    best = fmin_powell(residual_sum, params_guess,
                       args=(individuals, mean_curve), disp=0)
    if not return_b:
        # Only S_max is wanted, not the K_m (B) parameter.
        return best[0]
    return best