def test_case_control():
    configuration = {
        'n': 20000,
        'bias': [-3, 0, 0],
        'positive_sampling_weight': 10,
        's': 200,
        'seed': 3,
        'link': special.expit,
        'linkln': kernels.expitln,
    }
    data = kernels.simulate_data(**configuration)
    log_likelihood = ft.partial(
        kernels.evaluate_case_control_log_likelihood,
        x=data['x_observed'],
        y=data['y_observed'],
        prevalence=data['marginal'],
        linkln=data['linkln'],
    )

    # Find the maximum likelihood estimate
    theta = data['theta']
    x0 = np.random.normal(size=theta.shape)
    # Use mean aggregation to avoid https://stackoverflow.com/a/54446479/1150961
    result = optimize.minimize(lambda x: -log_likelihood(x, aggregate='mean'), x0)
    assert result.success, 'optimization failed'

    # Draw samples and plot them
    cov = 2.4**2 * result.hess_inv / (len(data['x_observed']) * len(theta))
    xs, values = kernels.sample(log_likelihood, result.x, cov, 5000)
    _plot_inference(theta, xs, result, filename='tests/~case_control.png')
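# The tests in this module treat `kernels.evaluate_case_control_log_likelihood`
# as a black box. A minimal sketch of a prevalence-corrected case-control
# logistic log-likelihood, assuming `kernels.expitln` is a numerically stable
# log-sigmoid (hypothetical; the real implementation differs in signature and
# supports an `aggregate` argument):
def _case_control_log_likelihood_sketch(theta, x, y, prevalence):
    # Sampling rates of cases and controls relative to the population; only
    # their ratio enters, so the unknown population size cancels.
    q1 = y.sum() / prevalence
    q0 = (1 - y).sum() / (1 - prevalence)
    # Offset the linear predictor to correct for oversampled cases
    # (cf. the prior correction of King & Zeng, 2001).
    logits = x @ theta + np.log(q1 / q0)
    return np.sum(y * kernels.expitln(logits) + (1 - y) * kernels.expitln(-logits))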
def test_network_inference():
    # Sample data
    configuration = {
        'n': 2000,
        'bias': [-7, 0, 0],
        'link': special.expit,
        'linkln': kernels.expitln,
        's': 100,
        'seed': 0,
        'feature_map': kernels.l1_feature_map,
    }
    data = kernels.simulate_network_data(**configuration)

    # Evaluate an estimate of the prevalence in the population
    k = len(data['pairs']) / data['s']
    prevalence = k / (data['n'] - 1)

    # Get the features for cases
    i1, j1 = data['pairs'].T
    z = data['z']
    x_cases = data['feature_map'](z[i1], z[j1])

    # Sample controls and get their features
    i0, j0 = kernels.sample_controls(data['egos'], 3 * len(data['pairs']))
    x_controls = data['feature_map'](z[i0], z[j0])

    # Concatenate features and construct indicator variables
    x_observed = np.concatenate([x_cases, x_controls])
    y_observed = np.concatenate(
        [np.ones(len(x_cases)), np.zeros(len(x_controls))])

    log_likelihood = ft.partial(
        kernels.evaluate_case_control_log_likelihood,
        x=x_observed,
        y=y_observed,
        prevalence=prevalence,
        linkln=data['linkln'],
    )

    # Find the maximum likelihood estimate
    theta = data['theta']
    x0 = np.random.normal(size=theta.shape)
    # Use mean aggregation to avoid https://stackoverflow.com/a/54446479/1150961
    result = optimize.minimize(lambda x: -log_likelihood(x, aggregate='mean'), x0)
    assert result.success, 'optimization failed'

    # Draw samples and plot them
    cov = 2.4**2 * result.hess_inv / (len(x_observed) * len(theta))
    xs, values = kernels.sample(log_likelihood, result.x, cov, 5000)
    _plot_inference(theta, xs, result, filename='tests/~network_inference.png')
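# `kernels.sample_controls` is used above as a black box. A minimal sketch,
# assuming it draws both elements of each control dyad uniformly from the
# surveyed egos (hypothetical; the real implementation may also exclude
# observed pairs or sample alters from the whole population):
def _sample_controls_sketch(egos, num_controls, rng=None):
    rng = np.random.default_rng(rng)
    i = rng.choice(egos, size=num_controls)
    j = rng.choice(egos, size=num_controls)
    # Redraw alters until no control dyad is a self-pair.
    mask = i == j
    while mask.any():
        j[mask] = rng.choice(egos, size=mask.sum())
        mask = i == j
    return i, j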
def test_sample_invalid_arguments():
    def _log_dist(_):
        return 0

    x = 0
    cov = 1
    num = 10
    with pytest.raises(ValueError):
        kernels.sample(_log_dist, np.empty((1, 1)), cov, num)
    with pytest.raises(ValueError):
        kernels.sample(_log_dist, x, np.empty((1, 1, 1)), num)
    with pytest.raises(ValueError):
        kernels.sample(_log_dist, x, np.empty((1, 2)), num)
def test_sample():
    xs, values = kernels.sample(lambda x: -x ** 2 / 2, 0, 1, 10000)
    assert abs(np.mean(xs)) < 0.1
    assert abs(np.std(xs) - 1) < 0.1
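# `kernels.sample` is exercised above with the signature
# (log_dist, x, cov, num). A minimal random-walk Metropolis sketch matching
# that call pattern (hypothetical; the real kernel differs at least in the
# shape validation exercised by test_sample_invalid_arguments):
def _metropolis_sketch(log_dist, x, cov, num, rng=None):
    rng = np.random.default_rng(rng)
    x = np.atleast_1d(np.asarray(x, dtype=float))
    cov = np.atleast_2d(cov)
    value = log_dist(x)
    xs, values = [], []
    for _ in range(num):
        proposal = rng.multivariate_normal(x, cov)
        proposal_value = log_dist(proposal)
        # Accept with the standard Metropolis probability min(1, ratio).
        if np.log(rng.uniform()) < proposal_value - value:
            x, value = proposal, proposal_value
        xs.append(x)
        values.append(value)
    return np.squeeze(np.asarray(xs)), np.asarray(values)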
    x0)
assert result.success, result.message

# Evaluation based on numdifftools (should be more accurate)
cov = -np.linalg.inv(ndt.Hessian(log_posterior)(result.x))
logging.info('maximised posterior in %d function evaluations', result.nfev)
logging.info('MAP estimate: %s', dict(zip(feature_names, result.x)))
logging.info('approximate marginal std: %s',
             dict(zip(feature_names, np.sqrt(np.diag(cov)))))

# Draw samples from the log-posterior ----------------------------------------------------------
# Use the inverse Hessian from the optimisation to construct an approximate "optimal" proposal
# covariance following A. Gelman, G. O. Roberts, W. R. Gilks. "Efficient Metropolis jumping
# rules" (1996).
proposal_cov = 2.4**2 * cov / len(result.x)
xs, values = kernels.sample(log_posterior, result.x, proposal_cov, args.num_samples)
acceptance = kernels.evaluate_acceptance(values)
logging.info('obtained %d posterior samples with acceptance %.3f', args.num_samples, acceptance)
logging.info('posterior mean: %s', dict(zip(feature_names, np.mean(xs, axis=0))))
logging.info('posterior std: %s', dict(zip(feature_names, np.std(xs, axis=0))))

if 'theta' in data:
    logging.info('true values: %s', dict(zip(feature_names, data['theta'])))
    residuals = np.mean(xs, axis=0) - data['theta']
    logging.info('z-scores: %s', dict(zip(feature_names, residuals / np.std(xs, axis=0))))
    cov_ = np.cov(xs.T)
    chi2 = residuals.dot(np.linalg.inv(cov_)).dot(residuals)
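# `kernels.evaluate_acceptance` is not shown in this excerpt. One plausible
# estimator, sketched under the assumption that consecutive identical
# log-densities in the chain mark rejected proposals (hypothetical; distinct
# states with tied densities would be miscounted as rejections):
def _evaluate_acceptance_sketch(values):
    values = np.asarray(values)
    # Fraction of transitions whose log-density changed, i.e. accepted moves.
    return np.mean(values[1:] != values[:-1])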