def lhs(sample_size: int, dimensions: int, seed: Optional[int] = None) -> np.ndarray:
    """
    Latin Hypercube Sample design.

    Split [0, 1] into ``sample_size`` equal-width strata, draw one uniform
    point inside every stratum for each dimension, then shuffle the strata
    independently per dimension so that each row of the result combines
    randomly paired strata.

    Parameters
    ----------
    sample_size: int
        Number of requested sample points.
    dimensions: int
        Number of dimensions to sample in.
    seed: {None, int}, optional
        Value handed to ``set_seed`` before any random numbers are drawn.
        Default = None.

    Returns
    -------
    sample: ndarray
        (sample_size, dimensions) float64 array. Every column holds exactly
        one value from each of the ``sample_size`` strata of [0, 1].

    See Also
    --------
    mdurs
    optimised_lhs
    """
    # pylint: disable=no-member
    set_seed(seed)

    # Stratum boundaries: edges[k] and edges[k + 1] bracket stratum k.
    edges = np.linspace(0, 1, sample_size + 1)
    jitter = np.random.random((sample_size, dimensions))
    floor = edges[:-1]
    ceiling = edges[1:]

    # Row k of `stratified` lies inside stratum k in every column.
    width = (ceiling - floor)[:, np.newaxis]
    stratified = jitter * width + floor[:, np.newaxis]

    design = np.empty((sample_size, dimensions), order='C', dtype=np.float64)
    row_ids = range(sample_size)
    for col in range(dimensions):
        # Independent shuffle per column decouples the strata across dimensions.
        shuffled = np.random.permutation(row_ids)
        design[:, col] = stratified[shuffled, col]
    return design
def urandom(sample_size: int, dimensions: int, seed: Optional[int] = None):
    """
    Uniform random sample.

    Parameters
    ----------
    sample_size: int
        Number of random vectors to draw.
    dimensions: int
        Dimension of each random vector.
    seed: {None, int}, optional
        Value handed to ``set_seed`` before drawing. Default = None.

    Returns
    -------
    urvs: ndarray
        (sample_size, dimensions) array drawn with ``np.random.random``,
        i.e. independently and uniformly on [0, 1).
    """
    # pylint: disable=no-member
    set_seed(seed)
    shape = (sample_size, dimensions)
    return np.random.random(shape)
def solve_inv_gamma(
    lower_bound: float,
    upper_bound: float,
    lower_tol: float,
    upper_tol: float,
    gridsize: int = 10000,
    max_attempts: int = 3,
    seed: Optional[int] = None,
) -> Tuple[float, float]:
    # pylint: disable= too-many-arguments, too-many-locals, no-member
    """
    Solve system of equations to find appropriate inverse gamma parameters.

    Aims to identify parameters alpha and beta such that, for an inverse
    gamma distribution parametrised by alpha and beta:

    * A total of lower_tol probability mass lies < lower_bound
    * A total of upper_tol probability mass lies > upper_bound

    The system of equations is built by ``create_objective`` and solved with
    ``scipy.optimize.root``. Each attempt starts from the best point of a
    fresh random grid search over [0, 10) x [0, 10); if the root finder does
    not report success, a new grid is drawn and the search is repeated, up
    to ``max_attempts`` times in total.

    Parameters
    ----------
    lower_bound: float
        Lower bound.
    upper_bound: float
        Upper bound.
    lower_tol: float
        Lower bound tolerance.
    upper_tol: float
        Upper bound tolerance.
    gridsize: int, optional
        Number of random candidate points evaluated in each preliminary
        grid search. Default = 10000.
    max_attempts: int, optional
        Maximum number of grid-search + root-finding attempts. At least one
        attempt is always made. Default = 3.
    seed: {None, int32}
        Seed passed to ``utils.set_seed`` before any random numbers are
        drawn. Default = None.

    Returns
    -------
    alpha: float
        Inverse gamma parameter alpha.
    beta: float
        Inverse gamma parameter beta.

    Raises
    ------
    ValueError
        If ``lower_bound`` is not strictly smaller than ``upper_bound``.
    RuntimeError
        If the root finder has not converged after ``max_attempts`` attempts.
    """
    utils.set_seed(seed)
    if lower_bound >= upper_bound:
        raise ValueError('Lower bound must be smaller than upper bound.')

    obj = create_objective(lower_bound, upper_bound, lower_tol, upper_tol)
    scales = np.array([10, 10])
    obj_grid = np.empty((gridsize, 2))

    attempts = 0
    while True:
        # Count every attempt so the loop is guaranteed to terminate.
        attempts += 1
        theta_grid = np.random.random((gridsize, 2)) * scales
        # objective function is not vectorised, so run in loop...
        for i in range(gridsize):
            obj_grid[i, :] = obj(theta_grid[i, :])
        # Start the root finder from the candidate with the smallest
        # Euclidean norm of the objective.
        obj_grid_norm = np.sqrt(np.sum(obj_grid**2, axis=1))
        theta0 = theta_grid[obj_grid_norm.argmin(), :]
        theta_sol = optimize.root(obj, theta0)
        if theta_sol['success']:
            return theta_sol['x'][0], theta_sol['x'][1]
        if attempts >= max_attempts:
            raise RuntimeError(
                'Maximum number of attempts exceeded without convergence.')
def sobol(
    sample_size: int,
    dimensions: int,
    seed: Optional[int] = None,
    generator_seed: int = 1,
    skip: int = 0,
) -> np.ndarray:
    """
    Generate n length d quasi-random vectors from the Sobol sequence.

    Pulls ``sample_size`` consecutive points from the ``i4_sobol2`` generator
    and stacks them row by row. The approach mirrors i4sobol_generate from
    the sobol_seq package (https://github.com/naught101/sobol_seq) [1]_, but
    relies on a generator rather than global state; the dimension of the
    sequence is fixed once the generator has been created.

    Parameters
    ----------
    sample_size: int
        The number of vectors to retrieve.
    dimensions: int
        The dimension of each vector.
    seed: {None, int32}
        Value handed to ``set_seed``. Default = None.
    generator_seed : int
        Forwarded to ``i4_sobol2`` as ``generator_seed``. Default = 1.
    skip: int
        Forwarded to ``i4_sobol2`` as ``skip``; see that function for its
        exact meaning. Default = 0.

    Returns
    -------
    samples : ndarray
        (sample_size, dimensions) float64 array whose rows are the points
        yielded by ``i4_sobol2``, in generation order.

    Notes
    -----
    TODO: implement ability to resume sampling from an existing Sobol
    sequence.

    References
    ----------
    [1] Sobol, I.M., 1976. Uniformly distributed sequences with an additional
    uniform property. USSR Computational Mathematics and Mathematical
    Physics, 16(5), pp.236-242.

    See Also
    --------
    sobol_scatter : Sobol sequence with additive randomisation.
    i4_sobol2 : Sobol sequence generator.
    """
    set_seed(seed)
    design = np.empty((sample_size, dimensions), dtype=np.float64)
    point_stream = i4_sobol2(dimensions, generator_seed=generator_seed, skip=skip)
    for row in range(sample_size):
        # One generator step per requested point.
        design[row, :] = next(point_stream)
    return design
def optimised_lhs(
    sample_size: int,
    dimensions: int,
    iterations: int = 100,
    measure: str = 'euclidean',
    criteria: Union[str, ta.LhsCriteria] = 'maximin',
    options: Optional[Mapping[str, Any]] = None,
    seed: Optional[int] = None,
) -> np.ndarray:
    """Optimised Latin Hypercube Sample design.

    Pick a sample from a collection of latin hypercube designs maximising a
    specified criteria, nominally the 'maximin' criteria of Morris and
    Mitchell [1]_.

    optimised_lhs generates ``iterations`` random lhs designs, scores each
    one with ``eval_criteria`` and returns the design with the highest score.
    The score is a function of a specified distance measure (or 'metric' -
    though 'metric' is not necessarily a metric in the mathematical sense).
    A valid distance measure is any supported by scipy's cdist, of which
    typical choices are:

    * 'cityblock' : L1 distance
    * 'euclidean' : L2 distance
    * 'sqeuclidean' : squared L2 distance

    Currently supported comparison criteria are:

    * 'maximin' [1]_.

    Parameters
    ----------
    sample_size: int
        Number of requested sample points
    dimensions: int
        Number of dimensions
    iterations: int, optional
        The number of individual designs to compare; must be at least 1.
        The design maximising 'criteria' after the requested number of
        iterations will be returned. Default = 100.
    measure: str, optional
        Distance measure to be used for comparing designs. References one of
        the measures compatible with scipy's spatial.distance.cdist function.
        Default = 'euclidean'.
    criteria: {str, callable}, optional
        Comparison criteria:

        * 'maximin' - maximin criteria.
        * callable - user supplied function; see Notes.
    options: mapping, optional
        Extra options forwarded unchanged to ``eval_criteria``. An empty
        dict is used when None. Default = None.
    seed: {None, int}, optional
        Value handed to ``set_seed`` before any random numbers are drawn.
        Default = None.

    Returns
    -------
    sample: ndarray
        (sample_size, dimensions) array of n sample points in d dimensions.
        Results are scaled on [0,1].

    Raises
    ------
    ValueError
        If ``iterations`` is smaller than 1, or if no design produced a
        criteria value greater than -inf (for example when the criteria
        evaluates to NaN for every design).

    Notes
    -----
    A user supplied function can be used as a comparison criteria. The
    supplied function accepts a vector of pairwise distances calculated
    using 'measure' (evaluation is delegated to ``eval_criteria``). The
    function should return a quantity intended to be maximised.

    References
    ----------
    [1] Morris, M.D. and Mitchell, T.J., 1995. Exploratory designs for
    computational experiments. Journal of statistical planning and
    inference, 43(3), pp.381-402.

    See Also
    --------
    scipy.spatial.distance.cdist
    """
    # pylint: disable=too-many-arguments, too-many-locals, no-member
    if iterations < 1:
        raise ValueError('iterations must be at least 1.')
    set_seed(seed)
    if options is None:
        options = {}

    # Stratum boundaries: row k of every candidate design falls in
    # [lower[k], upper[k]) before shuffling.
    slices = np.linspace(0, 1, sample_size + 1)
    lower = slices[:sample_size]
    upper = slices[1:]
    indices_list = np.arange(sample_size)

    # Work buffers reused by every candidate design.
    points = np.empty((sample_size, dimensions), order='C', dtype=np.float64)
    sample = np.empty((sample_size, dimensions), order='C', dtype=np.float64)

    best_score = -np.inf
    best_design = None
    for _ in range(iterations):
        urnd = np.random.random((sample_size, dimensions))
        for j in range(dimensions):
            points[:, j] = urnd[:, j] * (upper - lower) + lower
            index = np.random.permutation(indices_list)
            sample[:, j] = points[index, j]
        delta = eval_criteria(sample, measure, criteria, options)
        if delta > best_score:
            best_score = delta
            # The copy is required: `sample` is overwritten in place by the
            # next candidate design.
            best_design = sample.copy()

    if best_design is None:
        raise ValueError(
            'No design produced a criteria value greater than -inf.')
    return best_design
def mdurs(
    sample_size: int,
    dimensions: int,
    scale_factor: int = 10,
    nearest_k: int = 2,
    measure: str = 'cityblock',
    seed: Optional[int] = None,
) -> np.ndarray:
    """
    Multi-Dimensionally Uniform Random Sample.

    Based on the "LHSMDU" algorithm of Deutsch and Deutsch [1]_. A candidate
    pool of ``scale_factor * sample_size`` uniform random points is drawn,
    then points are removed one at a time until ``sample_size`` remain. At
    every step the point removed is the one with the smallest mean distance
    to its ``nearest_k`` nearest neighbours, i.e. the most crowded point.

    The full pairwise distance matrix is recomputed after every removal, so
    runtime grows quickly with ``sample_size``; the method is best suited to
    small designs (roughly n < 50).

    Valid distance measures are any supported by
    scipy.spatial.distance.cdist, of which typical choices are:

    * 'cityblock' : L1 distance
    * 'euclidean' : L2 distance
    * 'sqeuclidean' : squared L2 distance

    Parameters
    ----------
    sample_size: int
        Number of requested sample points
    dimensions: int
        Number of dimensions
    scale_factor : int, optional
        Size of the candidate pool as a multiple of ``sample_size``
        (default = 10). You should not need to change this; see [1]_.
    nearest_k: int, optional
        Number of neighbours used to compute the mean neighbour distance
        (default = 2). You should not need to change this; see [1]_.
    measure: string, optional
        Distance measure to be used. Passed as the ``metric`` argument to
        scipy's spatial.distance.cdist function. Default = 'cityblock'.
    seed: {None, int}, optional
        Value handed to ``set_seed`` before any random numbers are drawn.
        Default = None.

    Returns
    -------
    random_sample: ndarray
        (sample_size, dimensions) array holding the surviving candidate
        points, each drawn from [0, 1).

    References
    ----------
    [1] Deutsch, J.L. and Deutsch, C.V., 2012. Latin hypercube sampling with
    multidimensional uniformity. Journal of Statistical Planning and
    Inference, 142(3), pp.763-772.

    See Also
    --------
    lhs
    optimised_lhs
    scipy.spatial.distance.cdist
    """
    # pylint: disable=no-member
    set_seed(seed)
    pool = np.random.random((scale_factor * sample_size, dimensions))

    while len(pool) > sample_size:
        pairwise = cdist(pool, pool, metric=measure)
        # Sorted position 0 is skipped: it is normally the point's zero
        # distance to itself. The next `nearest_k` entries are its closest
        # neighbours.
        crowding = np.array(
            [np.mean(np.sort(row)[1:1 + nearest_k]) for row in pairwise],
            dtype=np.float64,
        )
        # Drop the most crowded candidate and re-evaluate.
        pool = np.delete(pool, np.argmin(crowding), axis=0)
    return pool