def test_gp_regression_with_warping():
    def f(x):
        return np.sin(3 * np.log(x))

    np.random.seed(7)

    L, U = -5., 12.
    input_range = (2.**L, 2.**U)
    x_train = np.sort(2.**np.random.uniform(L, U, 250))
    x_test = np.sort(2.**np.random.uniform(L, U, 500))
    y_train = f(x_train)
    y_test = f(x_test)

    # to mx.nd
    y_train_mx_nd = mx.nd.array(y_train)
    x_train_mx_nd = mx.nd.array(x_train)
    x_test_mx_nd = mx.nd.array(x_test)

    kernels = [
        Matern52(dimension=1),
        WarpedKernel(
            kernel=Matern52(dimension=1),
            warping=Warping(dimension=1, index_to_range={0: input_range}))
    ]

    models = [
        GaussianProcessRegression(kernel=k, random_seed=0) for k in kernels
    ]

    train_errors, test_errors = [], []
    for model in models:
        model.fit(x_train_mx_nd, y_train_mx_nd)

        mu_train, var_train = model.predict(x_train_mx_nd)[0]
        mu_test, var_test = model.predict(x_test_mx_nd)[0]

        # back to np.array
        mu_train = mu_train.asnumpy()
        mu_test = mu_test.asnumpy()
        # var_train = var_train.asnumpy()
        # var_test = var_test.asnumpy()

        train_errors.append(np.mean(np.abs(mu_train - y_train)))
        test_errors.append(np.mean(np.abs(mu_test - y_test)))

    # The two models have similar performance on training points
    np.testing.assert_almost_equal(
        train_errors[0], train_errors[1], decimal=4)

    # As expected, the model with warping largely outperforms the model without
    assert test_errors[1] < 0.1 * test_errors[0]
def test_likelihood_encoding():
    mean = ScalarMeanFunction()
    kernel = Matern52(dimension=1)
    likelihood = MarginalLikelihood(mean=mean, kernel=kernel)
    assert isinstance(likelihood.encoding, LogarithmScalarEncoding)
    likelihood = MarginalLikelihood(
        mean=mean, kernel=kernel, encoding_type="positive")
    assert isinstance(likelihood.encoding, PositiveScalarEncoding)
def build_kernel(state: TuningJobState, do_warping: bool = False) -> KernelFunction:
    dims, warping_ranges = dimensionality_and_warping_ranges(state.hp_ranges)
    kernel = Matern52(dims, ARD=True)
    if do_warping:
        return WarpedKernel(
            kernel=kernel, warping=Warping(dims, warping_ranges))
    else:
        return kernel
def resource_kernel_factory(
        name: str, kernel_x: KernelFunction, mean_x: gluon.HybridBlock,
        max_metric_value: float) -> (KernelFunction, gluon.HybridBlock):
    """
    Given kernel function kernel_x and mean function mean_x over config x,
    create kernel and mean functions over (x, r), where r is the resource
    attribute (nonnegative scalar, usually in [0, 1]).

    :param name: Selects resource kernel type
    :param kernel_x: Kernel function over configs x
    :param mean_x: Mean function over configs x
    :param max_metric_value: Maximum value the metric can attain; used to
        initialize the exp-decay resource kernels
    :return: res_kernel, res_mean, both over (x, r)
    """
    if name == 'matern52':
        res_kernel = Matern52(dimension=kernel_x.dimension + 1, ARD=True)
        res_mean = mean_x
    elif name == 'matern52-res-warp':
        # Warping on resource dimension (last one)
        dim_x = kernel_x.dimension
        res_warping = Warping(
            dimension=dim_x + 1, index_to_range={dim_x: (0., 1.)})
        res_kernel = WarpedKernel(
            kernel=Matern52(dimension=dim_x + 1, ARD=True),
            warping=res_warping)
        res_mean = mean_x
    else:
        if name == 'exp-decay-sum':
            delta_fixed_value = 0.0
        elif name == 'exp-decay-combined':
            delta_fixed_value = None
        elif name == 'exp-decay-delta1':
            delta_fixed_value = 1.0
        else:
            raise AssertionError("name = '{}' not supported".format(name))
        res_kernel = ExponentialDecayResourcesKernelFunction(
            kernel_x, mean_x, gamma_init=0.5 * max_metric_value,
            delta_fixed_value=delta_fixed_value,
            max_metric_value=max_metric_value)
        res_mean = ExponentialDecayResourcesMeanFunction(kernel=res_kernel)
    return res_kernel, res_mean
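
# Minimal usage sketch for resource_kernel_factory (a hypothetical example,
# not part of the original code): build kernel and mean functions over (x, r)
# from a 2-dimensional config kernel, with the resource dimension warped onto
# (0, 1). Variable names and argument values here are assumptions for
# illustration only.
def _example_resource_kernel_usage():
    kernel_x = Matern52(dimension=2, ARD=True)
    mean_x = ScalarMeanFunction()
    res_kernel, res_mean = resource_kernel_factory(
        'matern52-res-warp', kernel_x=kernel_x, mean_x=mean_x,
        max_metric_value=1.0)
    # res_kernel covers 3 inputs: the 2 config dimensions plus the resource r
    return res_kernel, res_mean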
def test_set_gp_hps():
    mean = ScalarMeanFunction()
    kernel = Matern52(dimension=1)
    warping = Warping(dimension=1, index_to_range={0: (-4., 4.)})
    warped_kernel = WarpedKernel(kernel=kernel, warping=warping)
    likelihood = MarginalLikelihood(
        kernel=warped_kernel, mean=mean, initial_noise_variance=1e-6)
    likelihood.initialize(ctx=mx.cpu(), force_reinit=True)
    likelihood.hybridize()

    hp_values = np.array([1e-2, 1.0, 0.5, 0.3, 0.2, 1.1])
    _set_gp_hps(hp_values, likelihood)
    np.testing.assert_array_almost_equal(hp_values, _get_gp_hps(likelihood))
def test_incremental_update():
    def f(x):
        return np.sin(x) / x

    np.random.seed(298424)
    std_noise = 0.01

    for rep in range(10):
        model = GaussianProcessRegression(kernel=Matern52(dimension=1))
        # Sample data
        num_train = np.random.randint(low=5, high=15)
        num_incr = np.random.randint(low=1, high=7)
        sizes = [num_train, num_incr]
        features = []
        targets = []
        for sz in sizes:
            feats = np.random.uniform(
                low=-1.0, high=1.0, size=sz).reshape((-1, 1))
            features.append(feats)
            targs = f(feats)
            targs += np.random.normal(0.0, std_noise, size=targs.shape)
            targets.append(targs)
        # Posterior state by incremental updating
        train_features = to_nd(features[0])
        train_targets = to_nd(targets[0])
        model.fit(train_features, train_targets)
        noise_variance_1 = model.likelihood.get_noise_variance()
        state_incr = IncrementalUpdateGPPosteriorState(
            features=train_features, targets=train_targets,
            mean=model.likelihood.mean, kernel=model.likelihood.kernel,
            noise_variance=model.likelihood.get_noise_variance(as_ndarray=True))
        for i in range(num_incr):
            state_incr = state_incr.update(
                to_nd(features[1][i].reshape((1, -1))),
                to_nd(targets[1][i].reshape((1, -1))))
        noise_variance_2 = state_incr.noise_variance.asscalar()
        # Posterior state by direct computation
        state_comp = GaussProcPosteriorState(
            features=to_nd(np.concatenate(features, axis=0)),
            targets=to_nd(np.concatenate(targets, axis=0)),
            mean=model.likelihood.mean, kernel=model.likelihood.kernel,
            noise_variance=state_incr.noise_variance)
        # Compare them
        assert noise_variance_1 == noise_variance_2, \
            "noise_variance_1 = {} != {} = noise_variance_2".format(
                noise_variance_1, noise_variance_2)
        chol_fact_incr = state_incr.chol_fact.asnumpy()
        chol_fact_comp = state_comp.chol_fact.asnumpy()
        np.testing.assert_almost_equal(chol_fact_incr, chol_fact_comp, decimal=2)
        pred_mat_incr = state_incr.pred_mat.asnumpy()
        pred_mat_comp = state_comp.pred_mat.asnumpy()
        np.testing.assert_almost_equal(pred_mat_incr, pred_mat_comp, decimal=2)
def test_get_gp_hps():
    mean = ScalarMeanFunction()
    kernel = Matern52(dimension=1)
    warping = Warping(dimension=1, index_to_range={0: (-4., 4.)})
    warped_kernel = WarpedKernel(kernel=kernel, warping=warping)
    likelihood = MarginalLikelihood(
        kernel=warped_kernel, mean=mean, initial_noise_variance=1e-6)
    likelihood.initialize(ctx=mx.cpu(), force_reinit=True)
    likelihood.hybridize()

    hp_values = _get_gp_hps(likelihood)
    # The order of the hps is: noise, mean, covariance scale, bandwidth,
    # warping a, warping b
    np.testing.assert_array_almost_equal(
        hp_values, np.array([1e-6, 0.0, 1.0, 1.0, 1.0, 1.0]))
def test_gp_regression_2d_with_ard():
    def f(x):
        # Only dependent on the first column of x
        return np.sin(x[:, 0]) / x[:, 0]

    np.random.seed(7)

    dimension = 3
    # 30 train and test points in R^3
    x_train = np.random.uniform(-5, 5, size=(30, dimension))
    x_test = np.random.uniform(-5, 5, size=(30, dimension))
    y_train = f(x_train)
    y_test = f(x_test)

    # to mx.nd
    y_train_mx_nd = mx.nd.array(y_train)
    x_train_mx_nd = mx.nd.array(x_train)
    x_test_mx_nd = mx.nd.array(x_test)

    model = GaussianProcessRegression(
        kernel=Matern52(dimension=dimension, ARD=True))
    model.fit(x_train_mx_nd, y_train_mx_nd)

    # Check that the residual noise variance learned by empirical Bayes is of
    # the same order as the smallest allowed value (since there is no noise)
    noise_variance = model.likelihood.get_noise_variance()
    np.testing.assert_almost_equal(noise_variance, NOISE_VARIANCE_LOWER_BOUND)

    # Check that the bandwidths learned by empirical Bayes reflect the fact
    # that only the first column is useful. In particular, for the useless
    # dimensions indexed by {1, 2}, the inverse bandwidths should be close to
    # INVERSE_BANDWIDTHS_LOWER_BOUND (conversely, the bandwidths should be
    # close to their highest allowed values)
    sqd = model.likelihood.kernel.squared_distance
    inverse_bandwidths = sqd.encoding.get(
        mx.nd, sqd.inverse_bandwidths_internal.data()).asnumpy()
    assert inverse_bandwidths[0] > inverse_bandwidths[1] and \
        inverse_bandwidths[0] > inverse_bandwidths[2]
    np.testing.assert_almost_equal(
        inverse_bandwidths[1], INVERSE_BANDWIDTHS_LOWER_BOUND)
    np.testing.assert_almost_equal(
        inverse_bandwidths[2], INVERSE_BANDWIDTHS_LOWER_BOUND)

    mu_train, _ = model.predict(x_train_mx_nd)[0]
    mu_test, _ = model.predict(x_test_mx_nd)[0]

    # back to np.array
    mu_train = mu_train.asnumpy()
    mu_test = mu_test.asnumpy()

    np.testing.assert_almost_equal(mu_train, y_train, decimal=2)
    # Fewer decimals imposed for the test points
    np.testing.assert_almost_equal(mu_test, y_test, decimal=1)
def fit_predict_ours(
        data: dict, random_seed: int,
        optimization_config: OptimizationConfig,
        test_intermediates: Optional[dict] = None) -> dict:
    # Create surrogate model
    num_dims = len(data['ss_limits'])
    _gpmodel = GaussianProcessRegression(
        kernel=Matern52(num_dims, ARD=True),
        mean=ZeroMeanFunction(),  # Instead of ScalarMeanFunction
        optimization_config=optimization_config,
        random_seed=random_seed,
        test_intermediates=test_intermediates)
    model = GPMXNetModel(
        data['state'], DEFAULT_METRIC, random_seed, _gpmodel,
        fit_parameters=True, num_fantasy_samples=20)
    model_params = model.get_params()
    print('Hyperparameters: {}'.format(model_params))
    # Prediction
    means, stddevs = model.predict(data['test_inputs'])[0]
    return {'means': means, 'stddevs': stddevs}
def test_gp_regression_with_noise():
    def f(x):
        return np.sin(x) / x

    np.random.seed(7)

    x_train = np.arange(-5, 5, 0.2)  # [-5, -4.8, -4.6, ..., 4.8]
    # [-4.9, -4.7, -4.5, ..., 4.9]; note that train and test points do not overlap
    x_test = np.arange(-4.9, 5, 0.2)
    y_train = f(x_train)
    y_test = f(x_test)

    std_noise = 0.01
    noise_train = np.random.normal(0.0, std_noise, size=y_train.shape)

    # to mx.nd
    y_train_mx_nd = mx.nd.array(y_train)
    noise_train_mx_nd = mx.nd.array(noise_train)
    x_train_mx_nd = mx.nd.array(x_train)
    x_test_mx_nd = mx.nd.array(x_test)

    model = GaussianProcessRegression(kernel=Matern52(dimension=1))
    model.fit(x_train_mx_nd, y_train_mx_nd + noise_train_mx_nd)

    # Check that the residual noise variance learned by empirical Bayes is of
    # the same order as std_noise^2
    noise_variance = model.likelihood.get_noise_variance()
    np.testing.assert_almost_equal(noise_variance, std_noise**2, decimal=4)

    mu_train, _ = model.predict(x_train_mx_nd)[0]
    mu_test, _ = model.predict(x_test_mx_nd)[0]

    # back to np.array
    mu_train = mu_train.asnumpy()
    mu_test = mu_test.asnumpy()

    np.testing.assert_almost_equal(mu_train, y_train, decimal=2)
    np.testing.assert_almost_equal(mu_test, y_test, decimal=2)
def test_gp_regression_no_noise():
    def f(x):
        return np.sin(x) / x

    x_train = np.arange(-5, 5, 0.2)  # [-5, -4.8, -4.6, ..., 4.8]
    # [-4.9, -4.7, -4.5, ..., 4.9]; note that train and test points do not overlap
    x_test = np.arange(-4.9, 5, 0.2)
    y_train = f(x_train)
    y_test = f(x_test)

    # to mx.nd
    y_train_mx_nd = mx.nd.array(y_train)
    x_train_mx_nd = mx.nd.array(x_train)
    x_test_mx_nd = mx.nd.array(x_test)

    model = GaussianProcessRegression(kernel=Matern52(dimension=1))
    model.fit(x_train_mx_nd, y_train_mx_nd)

    # Check that the residual noise variance learned by empirical Bayes is of
    # the same order as the smallest allowed value (since there is no noise)
    noise_variance = model.likelihood.get_noise_variance()
    np.testing.assert_almost_equal(noise_variance, NOISE_VARIANCE_LOWER_BOUND)

    mu_train, var_train = model.predict(x_train_mx_nd)[0]
    mu_test, var_test = model.predict(x_test_mx_nd)[0]

    # back to np.array
    mu_train = mu_train.asnumpy()
    mu_test = mu_test.asnumpy()
    var_train = var_train.asnumpy()
    var_test = var_test.asnumpy()

    np.testing.assert_almost_equal(mu_train, y_train, decimal=4)
    np.testing.assert_almost_equal(var_train, [0.0] * len(var_train), decimal=4)
    # Fewer decimals imposed for the test points
    np.testing.assert_almost_equal(mu_test, y_test, decimal=3)
def _create_common_objects(**kwargs):
    # TODO: Validity checks on kwargs arguments
    scheduler = kwargs['scheduler']
    config_space = kwargs['configspace']
    is_hyperband = scheduler.startswith('hyperband')
    if kwargs.get('debug_use_hyperparameter_ranges', False):
        assert isinstance(config_space, HyperparameterRanges)
        assert not is_hyperband, \
            "Cannot use debug_use_hyperparameter_ranges with Hyperband scheduling"
        hp_ranges_cs = config_space
    else:
        import ConfigSpace as CS
        assert isinstance(config_space, CS.ConfigurationSpace)
        hp_ranges_cs = HyperparameterRanges_CS(config_space)
    # Note: This base random seed is used to create different random seeds for
    # each BO get_config call internally
    random_seed = kwargs.get('random_seed', 31415927)
    # Skip optimization predicate for GP surrogate model
    if kwargs.get('opt_skip_num_max_resource', False) and is_hyperband:
        skip_optimization = SkipNoMaxResourcePredicate(
            init_length=kwargs['opt_skip_init_length'],
            resource_attr_name=kwargs['resource_attribute'],
            max_resource=kwargs['max_epochs'])
    elif kwargs.get('opt_skip_period', 1) > 1:
        skip_optimization = SkipPeriodicallyPredicate(
            init_length=kwargs['opt_skip_init_length'],
            period=kwargs['opt_skip_period'])
    else:
        skip_optimization = None
    # Profiler
    if kwargs.get('profiler', False):
        profiler = GPMXNetSimpleProfiler()
    else:
        profiler = None
    # Conversion from reward to metric (strictly decreasing) and back
    _map_reward = kwargs.get('map_reward', '1_minus_x')
    if isinstance(_map_reward, str):
        _map_reward_name = _map_reward
        supp_map_reward = {'1_minus_x', 'minus_x'}
        assert _map_reward_name in supp_map_reward, \
            "This factory needs map_reward in {}".format(supp_map_reward)
        _map_reward: MapReward = map_reward(
            const=1.0 if _map_reward_name == '1_minus_x' else 0.0)
    else:
        assert isinstance(_map_reward, MapReward), \
            "map_reward must either be a string or of MapReward type"
    if is_hyperband:
        # Note: 'min_reward' is needed only to support the exp-decay
        # surrogate model. If not given, it is assumed to be 0.
        min_reward = kwargs.get('min_reward', 0)
        max_metric_value = _map_reward(min_reward)
    else:
        max_metric_value = None
    opt_warmstart = kwargs.get('opt_warmstart', False)
    # Underlying GP regression model
    kernel = Matern52(dimension=hp_ranges_cs.ndarray_size(), ARD=True)
    mean = ScalarMeanFunction()
    if is_hyperband:
        kernel, mean = resource_kernel_factory(
            kwargs['gp_resource_kernel'], kernel_x=kernel, mean_x=mean,
            max_metric_value=max_metric_value)
    optimization_config = OptimizationConfig(
        lbfgs_tol=DEFAULT_OPTIMIZATION_CONFIG.lbfgs_tol,
        lbfgs_maxiter=kwargs['opt_maxiter'],
        verbose=kwargs['opt_verbose'],
        n_starts=kwargs['opt_nstarts'])
    debug_writer = None
    if kwargs.get('opt_debug_writer', False):
        fname_msk = kwargs.get('opt_debug_writer_fmask', 'debug_gpr_{}')
        debug_writer = DebugGPRegression(
            fname_msk=fname_msk, rolling_size=5)
    gpmodel = GaussianProcessRegression(
        kernel=kernel, mean=mean,
        optimization_config=optimization_config,
        fit_reset_params=not opt_warmstart,
        debug_writer=debug_writer)
    model_args = GPMXNetModelArgs(
        num_fantasy_samples=kwargs['num_fantasy_samples'],
        random_seed=random_seed,
        active_metric=DEFAULT_METRIC,
        normalize_targets=True)
    debug_log = DebugLogPrinter() if kwargs.get('debug_log', False) else None

    return hp_ranges_cs, random_seed, gpmodel, model_args, profiler, \
        _map_reward, skip_optimization, debug_log
def build_kernel():
    return WarpedKernel(
        kernel=Matern52(dimension=1),
        warping=Warping(dimension=1, index_to_range={0: (-4., 4.)}))
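
# Hypothetical usage sketch for build_kernel above (not an original test):
# fit a GP with the warped kernel, mirroring the regression tests in this
# file. The argument names below are assumptions for illustration; x_train,
# x_test are numpy arrays of inputs in (-4, 4), y_train the matching targets.
def _example_build_kernel_usage(x_train, y_train, x_test):
    model = GaussianProcessRegression(kernel=build_kernel(), random_seed=0)
    model.fit(mx.nd.array(x_train), mx.nd.array(y_train))
    mu_test, var_test = model.predict(mx.nd.array(x_test))[0]
    return mu_test.asnumpy(), var_test.asnumpy()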