def test_fit_only_loc_scale(self):
    """Fit a normal distribution with only `loc`, only `scale`, or both free.

    Every scenario draws a fresh sample from the same generator, passes
    bounds only for the parameters being fitted, and compares the fitted
    parameters with the expected ``(loc, scale)`` pair within ``self.tols``.
    """
    n_samples = 5000
    rng = np.random.default_rng(self.seed)
    norm = stats.norm
    loc_bounds = (0, 5)
    scale_bounds = (0, 5)

    # (keywords for `rvs`, bounds handed to `fit`, expected (loc, scale))
    scenarios = (
        # fit only loc
        ({'loc': 1.5}, {'loc': loc_bounds}, (1.5, 1)),
        # fit only scale
        ({'scale': 2.5}, {'scale': scale_bounds}, (0, 2.5)),
        # fit both loc and scale
        ({'loc': 1.5, 'scale': 2.5},
         {'loc': loc_bounds, 'scale': scale_bounds},
         (1.5, 2.5)),
    )

    for rvs_kwds, bounds, expected in scenarios:
        sample = norm.rvs(size=n_samples, random_state=rng, **rvs_kwds)
        fitted = stats.fit(norm, sample, bounds, optimizer=self.opt)
        assert_allclose(fitted.params, expected, **self.tols)
def test_basic_fit(self, dist_name):
    """Fit the distribution named `dist_name` to data drawn from itself.

    Parameters
    ----------
    dist_name : str
        Attribute name of a distribution in `stats`; it must also be a key
        of ``dict(distcont + distdiscrete)``, which supplies the reference
        shape parameters.

    The reference location and scale are drawn uniformly from ``(0, 10)``.
    For distributions exposing ``pmf`` the scale is dropped and the
    location is floored; for distributions exposing ``pdf`` both are kept.
    The fitted parameters must match the reference within ``self.tols``.
    """
    N = 5000
    dist_data = dict(distcont + distdiscrete)
    rng = np.random.default_rng(self.seed)
    dist = getattr(stats, dist_name)
    shapes = np.array(dist_data[dist_name])

    bounds = np.empty((len(shapes) + 2, 2), dtype=np.float64)
    # Bracket every shape by a factor of ten on each side.  The factor is
    # inverted for negative shapes so the lower bound stays below the upper
    # bound; for non-negative shapes this is identical to
    # ``shapes / 10`` and ``shapes * 10``.
    factor = 10.0**np.sign(shapes)
    bounds[:-2, 0] = shapes / factor
    bounds[:-2, 1] = shapes * factor
    bounds[-2] = (0, 10)  # loc
    bounds[-1] = (0, 10)  # scale

    loc = rng.uniform(*bounds[-2])
    scale = rng.uniform(*bounds[-1])
    ref = list(dist_data[dist_name]) + [loc, scale]

    if getattr(dist, 'pmf', False):
        # No scale parameter here: drop it (and its bounds) and use a
        # floored location.
        ref = ref[:-1]
        ref[-1] = np.floor(loc)
        data = dist.rvs(*ref, size=N, random_state=rng)
        res = stats.fit(dist, data, bounds[:-1], optimizer=self.opt)
    if getattr(dist, 'pdf', False):
        data = dist.rvs(*ref, size=N, random_state=rng)
        res = stats.fit(dist, data, bounds, optimizer=self.opt)

    assert_allclose(res.params, ref, **self.tols)
def test_dist_iv(self):
    """`stats.fit` raises ValueError for an invalid or unsupported `dist`."""
    # (expected message pattern, positional arguments for `stats.fit`)
    cases = (
        ("`dist` must be an instance of...",
         (10, self.data, self.shape_bounds_a)),
        ("Distribution `laplace` is not yet supported by...",
         (stats.laplace, self.data)),
    )
    for pattern, fit_args in cases:
        with pytest.raises(ValueError, match=pattern):
            stats.fit(*fit_args)
def test_guess(self):
    """Check that `guess` helps DE find the desired solution.

    Without a guess the fit is expected NOT to recover the generating
    parameters; with the generating parameters supplied as `guess` it
    must recover them within ``self.tols``.
    """
    true_params = (20, 7, 12, 0)
    search_bounds = [(2, 200), (0.7, 70), (1.2, 120), (0, 10)]
    nhypergeom = stats.nhypergeom

    rng = np.random.default_rng(self.seed)
    sample = nhypergeom.rvs(*true_params, size=2000, random_state=rng)

    def run_fit(**extra):
        return stats.fit(nhypergeom, sample, search_bounds,
                         optimizer=self.opt, **extra)

    unguided = run_fit()
    assert not np.allclose(unguided.params, true_params, **self.tols)

    guided = run_fit(guess=true_params)
    assert_allclose(guided.params, true_params, **self.tols)
def test_basic_fit(self, dist_name):
    """Fit the distribution named `dist_name` to data drawn from itself.

    Reference shapes come from ``dict(distcont + distdiscrete)``; location
    and scale are drawn uniformly from ``(0, 10)``.  The fit passes when
    `assert_nllf_less_or_close` accepts the fitted parameters against the
    reference ones under ``self.tols``.
    """
    n_samples = 5000
    reference_shapes = dict(distcont + distdiscrete)[dist_name]
    rng = np.random.default_rng(self.seed)
    dist = getattr(stats, dist_name)
    shapes = np.array(reference_shapes)

    # Sign-aware factor of ten: keeps lower <= upper for negative shapes.
    factor = 10**np.sign(shapes)
    n_shapes = len(shapes)
    bounds = np.empty((n_shapes + 2, 2), dtype=np.float64)
    bounds[:n_shapes, 0] = shapes / factor
    bounds[:n_shapes, 1] = shapes * factor
    bounds[n_shapes:] = (0, 10)  # rows for loc and scale

    loc = rng.uniform(*bounds[-2])
    scale = rng.uniform(*bounds[-1])
    ref = [*reference_shapes, loc, scale]

    if getattr(dist, 'pmf', False):
        # No scale parameter here: drop it and use a floored location.
        ref = ref[:-2] + [np.floor(loc)]
        data = dist.rvs(*ref, size=n_samples, random_state=rng)
        bounds = bounds[:-1]
    if getattr(dist, 'pdf', False):
        data = dist.rvs(*ref, size=n_samples, random_state=rng)

    with npt.suppress_warnings() as sup:
        sup.filter(RuntimeWarning, "overflow encountered")
        fitted = stats.fit(dist, data, bounds, optimizer=self.opt)

    assert_nllf_less_or_close(dist, data, fitted.params, ref, **self.tols)
def test_nchypergeom_wallenius(self):
    """Recover the shapes of Wallenius' noncentral hypergeometric law."""
    # The NC hypergeometric distributions are more challenging
    wallenius = stats.nchypergeom_wallenius
    true_shapes = (14, 8, 6, 0.5)
    search_bounds = [(0, 20), (0, 10), (0, 10), (0, 0.5)]

    rng = np.random.default_rng(self.seed)
    sample = wallenius.rvs(*true_shapes, size=5000, random_state=rng)

    fitted = stats.fit(wallenius, sample, search_bounds, optimizer=self.opt)
    # The last fitted parameter is excluded from the comparison.
    assert_allclose(fitted.params[:-1], true_shapes, **self.tols)
def test_yulesimon(self):
    """Recover the parameters ``(1.5, 4)`` of a Yule-Simon sample."""
    # yulesimon fit is not very sensitive to alpha except for small alpha
    yulesimon = stats.yulesimon
    true_params = (1.5, 4)
    search_bounds = [(0.15, 15), (0, 10)]

    rng = np.random.default_rng(self.seed)
    sample = yulesimon.rvs(*true_params, size=5000, random_state=rng)

    fitted = stats.fit(yulesimon, sample, search_bounds, optimizer=self.opt)
    assert_allclose(fitted.params, true_params, **self.tols)
def test_randint(self):
    """Recover the two shape parameters of `randint` from a sample."""
    # randint is overparameterized; test_basic_fit finds equally good fit
    randint = stats.randint
    true_shapes = (7, 31)
    search_bounds = [(0, 70), (0, 310)]

    rng = np.random.default_rng(self.seed)
    sample = randint.rvs(*true_shapes, size=5000, random_state=rng)

    fitted = stats.fit(randint, sample, search_bounds, optimizer=self.opt)
    # Only the first two fitted parameters are compared.
    assert_allclose(fitted.params[:2], true_shapes, **self.tols)
def test_nbinom(self):
    """Recover the `nbinom` shapes when only shape bounds are supplied."""
    # Fitting nbinom doesn't always get original shapes if loc is free
    nbinom = stats.nbinom
    true_shapes = (5, 0.5)
    search_bounds = [(0.5, 50), (0.05, 5)]

    rng = np.random.default_rng(self.seed)
    sample = nbinom.rvs(*true_shapes, size=7000, random_state=rng)

    fitted = stats.fit(nbinom, sample, search_bounds, optimizer=self.opt)
    # The last fitted parameter is excluded from the comparison.
    assert_allclose(fitted.params[:-1], true_shapes, **self.tols)
def test_nhypergeom(self):
    """Recover the `nhypergeom` shapes using bounds of (0, 30) per shape."""
    # DE doesn't find optimum for the bounds in `test_basic_fit`. NBD.
    nhypergeom = stats.nhypergeom
    true_shapes = (20, 7, 12)
    search_bounds = [(0, 30) for _ in true_shapes]

    rng = np.random.default_rng(self.seed)
    sample = nhypergeom.rvs(*true_shapes, size=2000, random_state=rng)

    fitted = stats.fit(nhypergeom, sample, search_bounds, optimizer=self.opt)
    # The last fitted parameter is excluded from the comparison.
    assert_allclose(fitted.params[:-1], true_shapes, **self.tols)
def test_hypergeom(self):
    """Recover the `hypergeom` shapes using bounds of (0, 30) per shape."""
    # hypergeometric distribution (M, n, N) \equiv (M, N, n)
    hypergeom = stats.hypergeom
    true_shapes = (20, 7, 12)
    search_bounds = [(0, 30) for _ in true_shapes]

    rng = np.random.default_rng(self.seed)
    sample = hypergeom.rvs(*true_shapes, size=1000, random_state=rng)

    fitted = stats.fit(hypergeom, sample, search_bounds, optimizer=self.opt)
    # The last fitted parameter is excluded from the comparison.
    assert_allclose(fitted.params[:-1], true_shapes, **self.tols)
def test_data_iv(self):
    """`stats.fit` raises ValueError for malformed `data`.

    Covers nested (non one-dimensional) data and data holding NaN,
    infinity or strings.
    """
    def fit_with(data):
        return stats.fit(self.dist, data, self.shape_bounds_a)

    not_1d = "`data` must be exactly one-dimensional."
    with pytest.raises(ValueError, match=not_1d):
        fit_with([[1, 2, 3]])

    not_finite = "All elements of `data` must be finite numbers."
    offending_inputs = (
        [1, 2, 3, np.nan],
        [1, 2, 3, np.inf],
        ['1', '2', '3'],
    )
    for bad_data in offending_inputs:
        with pytest.raises(ValueError, match=not_finite):
            fit_with(bad_data)
def test_boltzmann(self):
    """Recover the first and third `boltzmann` parameters from a sample."""
    # Boltzmann distribution shape is very insensitive to parameter N
    boltzmann = stats.boltzmann
    true_params = (1.4, 19, 4)
    search_bounds = [(0, 30), (0, 30), (0, 10)]

    rng = np.random.default_rng(self.seed)
    sample = boltzmann.rvs(*true_params, size=1000, random_state=rng)

    fitted = stats.fit(boltzmann, sample, search_bounds, optimizer=self.opt)
    # The middle parameter (index 1) is deliberately not checked.
    for index in (0, 2):
        assert_allclose(fitted.params[index], true_params[index],
                        **self.tols)
def test_missing_shape_bounds(self):
    """Fits succeed when bounds for some (or all) shapes are omitted.

    Some distributions have a small domain w.r.t. a parameter, e.g.
    $p \\in [0, 1]$ for the binomial distribution.  The user does not need
    to provide bounds for these because the intersection of the user's
    bounds (none) and the distribution's domain is finite.
    """
    n_samples = 1000
    rng = np.random.default_rng(self.seed)

    # binom: only `n` is bounded, and the bounds are given as an array
    # to check that arrays are accepted, too.
    binom = stats.binom
    expected_binom = (10, 0.65, 0)
    n, p, loc = expected_binom
    sample = binom.rvs(n, p, loc=loc, size=n_samples, random_state=rng)
    fitted = stats.fit(binom, sample, {'n': np.array([0, 20])},
                       optimizer=self.opt)
    assert_allclose(fitted.params, expected_binom, **self.tols)

    # bernoulli: no bounds are passed at all.
    bernoulli = stats.bernoulli
    expected_bernoulli = (0.314159, 0)
    p, loc = expected_bernoulli
    sample = bernoulli.rvs(p, loc=loc, size=n_samples, random_state=rng)
    fitted = stats.fit(bernoulli, sample, optimizer=self.opt)
    assert_allclose(fitted.params, expected_bernoulli, **self.tols)
def test_truncweibull_min(self):
    """Fit `truncweibull_min` with bounds of (0.1, 10) on every parameter."""
    # Can't guarantee that all distributions will fit all data with
    # arbitrary bounds. This distribution just happens to fail above.
    # Try something slightly different.
    truncweibull = stats.truncweibull_min
    true_params = (2.5, 0.25, 1.75, 2., 3.)
    search_bounds = [(0.1, 10) for _ in true_params]

    rng = np.random.default_rng(self.seed)
    sample = truncweibull.rvs(*true_params, size=1000, random_state=rng)

    fitted = stats.fit(truncweibull, sample, search_bounds,
                       optimizer=self.opt)
    assert_nllf_less_or_close(truncweibull, sample, fitted.params,
                              true_params, **self.tols)
def test_foldnorm(self):
    """Fit `foldnorm` with bounds of (0.1, 10) on `c`, `loc` and `scale`."""
    # Can't guarantee that all distributions will fit all data with
    # arbitrary bounds. This distribution just happens to fail above.
    # Try something slightly different.
    foldnorm = stats.foldnorm
    true_params = (1.952125337355587, 2., 3.)
    search_bounds = {name: (0.1, 10) for name in ('c', 'loc', 'scale')}

    rng = np.random.default_rng(self.seed)
    sample = foldnorm.rvs(*true_params, size=1000, random_state=rng)

    fitted = stats.fit(foldnorm, sample, search_bounds, optimizer=self.opt)
    assert_nllf_less_or_close(foldnorm, sample, fitted.params,
                              true_params, **self.tols)
def test_failure(self):
    """A fit that cannot explain the data reports failure.

    The sample contains the value 0 while the location is bounded below
    by 0.5; the result must carry the corresponding message and
    ``success is False``.
    """
    nbinom = stats.nbinom
    rng = np.random.default_rng(self.seed)
    sample = nbinom.rvs(5, 0.5, size=5000, random_state=rng)

    assert sample.min() == 0
    # With lower bounds on location at 0.5, likelihood is zero
    infeasible_bounds = [(0, 30), (0, 1), (0.5, 10)]
    outcome = stats.fit(nbinom, sample, infeasible_bounds)

    expected_prefix = "Optimization converged to parameter values that are"
    assert outcome.message.startswith(expected_prefix)
    assert outcome.success is False
def test_everything_fixed(self):
    """Fits whose parameters are all pinned return the pinned values.

    Three situations are covered: no bounds at all, bounds whose lower
    and upper ends coincide, and a binomial fit where `p` is pinned and
    `n` is bounded.
    """
    n_samples = 5000
    rng = np.random.default_rng(self.seed)

    norm = stats.norm
    true_loc, true_scale = 1.5, 2.5
    sample = norm.rvs(loc=true_loc, scale=true_scale, size=n_samples,
                      random_state=rng)

    # loc, scale fixed to 0, 1 by default
    default_fit = stats.fit(norm, sample)
    assert_allclose(default_fit.params, (0, 1), **self.tols)

    # loc, scale explicitly fixed
    pinned = {name: (value, value)
              for name, value in (('loc', true_loc), ('scale', true_scale))}
    pinned_fit = stats.fit(norm, sample, pinned)
    assert_allclose(pinned_fit.params, (true_loc, true_scale), **self.tols)

    # `n` gets fixed during polishing
    binom = stats.binom
    expected = (10, 0.65, 0)
    n, p, loc = expected
    sample = binom.rvs(n, p, loc=loc, size=n_samples, random_state=rng)
    binom_bounds = {'n': (0, 20), 'p': (0.65, 0.65)}
    binom_fit = stats.fit(binom, sample, binom_bounds, optimizer=self.opt)
    assert_allclose(binom_fit.params, expected, **self.tols)
def test_dist_iv(self):
    """`stats.fit` raises ValueError when `dist` is not a distribution."""
    not_a_distribution = 10
    expected_pattern = "`dist` must be an instance of..."
    with pytest.raises(ValueError, match=expected_pattern):
        stats.fit(not_a_distribution, self.data, self.shape_bounds_a)
def test_guess_iv(self):
    """Input validation of the `guess` argument of `stats.fit`.

    Each case names the pytest checker to use (`warns` or `raises`), the
    expected warning/exception class, the message pattern, and the
    offending `guess`.  Cases run in the listed order.
    """
    not_scalar = "Each element of `guess` must be a scalar..."
    cases = (
        (pytest.warns, RuntimeWarning,
         "Guesses provided for the following unrecognized...",
         {'n': 1, 'p': 0.5, '1': 255}),
        (pytest.raises, ValueError, not_scalar, {'n': 1, 'p': 'hi'}),
        (pytest.raises, ValueError, not_scalar, [1, 'f']),
        (pytest.raises, ValueError, not_scalar, [[1, 2]]),
        (pytest.raises, ValueError,
         "A `guess` sequence must contain at least 2...",
         [1]),
        (pytest.raises, ValueError,
         "A `guess` sequence may not contain more than 3...",
         [1, 2, 3, 4]),
        (pytest.warns, RuntimeWarning,
         "Guess for parameter `n` rounded...",
         {'n': 4.5, 'p': -0.5}),
        (pytest.warns, RuntimeWarning,
         "Guess for parameter `loc` rounded...",
         [5, 0.5, 0.5]),
        (pytest.warns, RuntimeWarning,
         "Guess for parameter `p` clipped...",
         {'n': 5, 'p': -0.5}),
        (pytest.warns, RuntimeWarning,
         "Guess for parameter `loc` clipped...",
         [5, 0.5, 1]),
    )

    for checker, category, pattern, guess in cases:
        with checker(category, match=pattern):
            stats.fit(self.dist, self.data, self.shape_bounds_d,
                      guess=guess)
def fit_dist_plot(samples, stats):
    """Plot `samples` with a fitted density and annotate the fitted values.

    Parameters
    ----------
    samples
        Data handed both to ``seaborn.distplot`` and to ``stats.fit``.
    stats
        Object passed as ``fit=`` to ``seaborn.distplot``.  Its
        ``fit(samples)`` result must unpack into exactly two values, which
        are labelled μ and σ on the plot.
        NOTE(review): presumably a two-parameter scipy.stats distribution
        such as ``norm`` -- verify against callers.

    Returns
    -------
    None
        The annotation is drawn on the axes returned by
        ``seaborn.distplot``.
    """
    seaborn.set(color_codes=True)
    axes = seaborn.distplot(samples, fit=stats)
    mu, sigma = stats.fit(samples)
    # Fitted parameters are floats: "%d" would truncate them toward zero
    # (0.7 -> 0) and fail on NaN, so show two decimals instead.
    label = "μ=%.2f, σ=%.2f" % (mu, sigma)
    axes.text(0.75, 0.9, label, transform=axes.transAxes)
def test_bounds_iv(self):
    """Input validation of the `bounds` argument of `stats.fit`.

    Each case names the pytest checker to use (`warns` or `raises`), the
    expected warning/exception class, the message pattern, and the extra
    positional arguments passed after ``self.dist, self.data`` (an empty
    tuple means no bounds are passed).  Cases run in the listed order.
    """
    not_a_pair = "Each element of `bounds` must be a tuple specifying..."
    unbounded_n = "The intersection of user-provided bounds for `n`"
    cases = (
        (pytest.warns, RuntimeWarning,
         "Bounds provided for the following unrecognized...",
         ({'n': (1, 10), 'p': (0, 1), '1': (0, 10)},)),
        (pytest.raises, ValueError,
         "Each element of a `bounds` sequence must be a tuple...",
         ([(1, 10, 3), (0, 1)],)),
        (pytest.raises, ValueError, not_a_pair,
         ([(1, 10, 3), (0, 1, 0.5)],)),
        (pytest.raises, ValueError, not_a_pair,
         ([1, 0],)),
        (pytest.raises, ValueError,
         "A `bounds` sequence must contain at least 2 elements...",
         ([(1, 10)],)),
        (pytest.raises, ValueError,
         "A `bounds` sequence may not contain more than 3 elements...",
         ([(1, 10), (1, 10), (1, 10), (1, 10)],)),
        (pytest.raises, ValueError,
         "There are no values for `p` on the interval...",
         ({'n': (1, 10), 'p': (1, 0)},)),
        (pytest.raises, ValueError,
         "There are no values for `n` on the interval...",
         ([(10, 1), (0, 1)],)),
        (pytest.raises, ValueError,
         "There are no integer values for `n` on the interval...",
         ([(1.4, 1.6), (0, 1)],)),
        (pytest.raises, ValueError, unbounded_n,
         ()),
        (pytest.raises, ValueError, unbounded_n,
         ([(-np.inf, np.inf), (0, 1)],)),
    )

    for checker, category, pattern, bounds_args in cases:
        with checker(category, match=pattern):
            stats.fit(self.dist, self.data, *bounds_args)