def test_multivariate_state_std_gaussian():
    """
    Compare the analytical gradient of ``StateStdGaussianPolicy`` with the
    value returned by ``numerical_diff_policy`` on 20 random states.

    Both approximators handed to the policy are linear regressors mapping
    5-dimensional inputs to 3-dimensional outputs.

    """
    np.random.seed(88)

    state_dim = 5
    action_dim = 3
    shapes = dict(input_shape=(state_dim, ), output_shape=(action_dim, ))

    mu_approximator = Regressor(LinearApproximator, **shapes)
    std_approximator = Regressor(LinearApproximator, **shapes)

    pi = StateStdGaussianPolicy(mu_approximator, std_approximator)

    # Weights are sampled uniformly and shifted by .1.
    pi.set_weights(np.random.rand(pi.weights_size) + .1)

    for sample in np.random.randn(20, state_dim):
        state = np.atleast_1d(sample)
        action = pi.draw_action(state)

        analytical = pi.diff(state, action)
        numerical = numerical_diff_policy(pi, state, action)

        assert np.allclose(analytical, numerical)
def multivariate_state_std_gaussian():
    """
    Compare the analytical gradient of ``StateStdGaussianPolicy`` with the
    value returned by ``numerical_diff_policy`` on 20 random states.

    The random generator is not seeded here, so the sampled weights and
    states differ between runs.

    """
    print('Testing multivariate state std gaussian policy...')
    n_dims = 5
    n_outs = 3

    approximator_params = dict(input_dim=n_dims)

    mu_approximator = Regressor(LinearApproximator,
                                input_shape=(n_dims,),
                                output_shape=(n_outs,),
                                params=approximator_params)
    std_approximator = Regressor(LinearApproximator,
                                 input_shape=(n_dims,),
                                 output_shape=(n_outs,),
                                 params=approximator_params)

    pi = StateStdGaussianPolicy(mu_approximator, std_approximator)

    # The vector covers the whole policy (``pi.weights_size`` entries), not
    # just the mean approximator.
    weights = np.random.rand(pi.weights_size) + 0.1
    pi.set_weights(weights)

    x = np.random.randn(20, n_dims)

    for x_i in x:
        state = np.atleast_1d(x_i)
        action = pi.draw_action(state)
        exact_diff = pi.diff(state, action)
        numerical_diff = numerical_diff_policy(pi, state, action)

        assert np.allclose(exact_diff, numerical_diff)
def test_deterministic_policy():
    """
    Check weight get/set, regressor access, policy evaluation and action
    drawing of a ``DeterministicPolicy`` built on a linear regressor.

    """
    np.random.seed(88)

    n_dims = 5
    approximator = Regressor(LinearApproximator,
                             input_shape=(n_dims, ),
                             output_shape=(2, ))
    pi = DeterministicPolicy(approximator)

    # Weight round trip: the policy and the regressor must agree.
    new_weights = np.random.rand(pi.weights_size)
    old_weights = pi.get_weights()
    pi.set_weights(new_weights)

    assert np.array_equal(new_weights, approximator.get_weights())
    assert not np.array_equal(old_weights, new_weights)
    assert np.array_equal(new_weights, pi.get_weights())

    first_state = np.random.randn(5)
    second_state = np.random.randn(5)
    action = approximator.predict(first_state)

    assert pi.get_regressor() == approximator

    # The policy evaluates to 1 on the action predicted for the state and
    # to 0 for the same action on a different state.
    assert pi(first_state, action) == 1
    assert pi(second_state, action) == 0

    expected_action = np.array([-1.86941072, -0.1789696])
    assert np.allclose(pi.draw_action(first_state), expected_action)
def __init__(self, approximator, policy, mdp_info, batch_size,
             target_update_frequency, initial_replay_size, max_replay_size,
             fit_params=None, approximator_params=None, n_approximators=1,
             clip_reward=True, weighted_update=False, update_type='weighted',
             delta=0.1, q_max=100, store_prob=False, max_spread=None):
    """
    Constructor.

    Args:
        approximator (object): the approximator class handed to
            ``Regressor`` for both the train and the target network;
        policy (object): the policy; its ``set_q`` method is called with
            the train approximator;
        mdp_info (object): forwarded to the parent constructor;
        batch_size (int): stored in ``_batch_size``;
        target_update_frequency (int): stored in
            ``_target_update_frequency``;
        initial_replay_size (int): first argument of ``ReplayMemory``;
        max_replay_size (int): second argument of ``ReplayMemory``;
        fit_params (dict, None): stored in ``_fit_params``; ``None`` is
            replaced by an empty dict;
        approximator_params (dict, None): keyword arguments for the two
            regressors; ``None`` is treated as an empty dict;
        n_approximators (int, 1): number of evenly spaced quantile levels
            used to select ``delta_index``;
        clip_reward (bool, True): stored in ``_clip_reward``;
        weighted_update (bool, False): stored as is;
        update_type (str, 'weighted'): stored as is;
        delta (float, 0.1): ``delta_index`` is the first quantile level
            that is at least ``1 - delta``;
        q_max (float, 100): stored as is;
        store_prob (bool, False): stored as is;
        max_spread (float, None): stored as is.

    """
    self._fit_params = dict() if fit_params is None else fit_params

    self._batch_size = batch_size
    self._n_approximators = n_approximators
    self._clip_reward = clip_reward
    self._target_update_frequency = target_update_frequency
    self.weighted_update = weighted_update
    self.update_type = update_type
    self.q_max = q_max
    self.store_prob = store_prob
    self.max_spread = max_spread

    # With a single approximator the evenly spaced grid would divide by
    # zero; use the single level 1. so that index 0 is selected.
    if n_approximators > 1:
        quantiles = [
            i * 1. / (n_approximators - 1) for i in range(n_approximators)
        ]
    else:
        quantiles = [1.]

    # First level reaching 1 - delta; fall back to the last index so the
    # attribute is always defined.
    self.delta_index = next(
        (p for p, q in enumerate(quantiles) if q >= 1 - delta),
        len(quantiles) - 1)

    self._replay_memory = ReplayMemory(initial_replay_size,
                                       max_replay_size)

    self._n_updates = 0

    # The signature advertises None as a valid default: treat it as
    # "no extra parameters" instead of failing on None['name'].
    if approximator_params is None:
        approximator_params = dict()

    apprx_params_train = deepcopy(approximator_params)
    apprx_params_train['name'] = 'train'
    apprx_params_target = deepcopy(approximator_params)
    apprx_params_target['name'] = 'target'
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    # Start with the target network equal to the train network.
    self.target_approximator.model.set_weights(
        self.approximator.model.get_weights())

    super(ParticleDQN, self).__init__(policy, mdp_info)
def __init__(self, approximator, policy, mdp_info, batch_size,
             initial_replay_size, max_replay_size, approximator_params,
             target_update_frequency, fit_params=None, n_approximators=1,
             clip_reward=True):
    """
    Constructor.

    Args:
        approximator (object): the approximator to use to fit the
            Q-function;
        policy (object): the policy; its ``set_q`` method receives the
            online approximator;
        mdp_info (object): forwarded to the parent constructor;
        batch_size (int): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        approximator_params (dict): parameters of the approximator to
            build;
        target_update_frequency (int): the number of samples collected
            between each update of the target network;
        fit_params (dict, None): parameters of the fitting algorithm of the
            approximator;
        n_approximators (int, 1): the number of approximator to use in
            ``AverageDQN``;
        clip_reward (bool, True): whether to clip the reward or not.

    """
    if fit_params is None:
        fit_params = dict()
    self._fit_params = fit_params

    self._batch_size = batch_size
    self._target_update_frequency = target_update_frequency
    self._n_approximators = n_approximators
    self._clip_reward = clip_reward

    self._n_updates = 0
    self._replay_memory = ReplayMemory(initial_replay_size,
                                       max_replay_size)

    # Each regressor gets its own deep copy of the parameters.
    train_kwargs = deepcopy(approximator_params)
    target_kwargs = deepcopy(approximator_params)

    self.approximator = Regressor(approximator, **train_kwargs)
    self.target_approximator = Regressor(
        approximator, n_models=self._n_approximators, **target_kwargs)
    policy.set_q(self.approximator)

    # Copy the online weights into the target model(s).
    if self._n_approximators != 1:
        for idx in range(self._n_approximators):
            self.target_approximator.model[idx].set_weights(
                self.approximator.model.get_weights())
    else:
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    super().__init__(policy, mdp_info)
def __init__(self, approximator, policy, mdp_info, batch_size,
             target_update_frequency, initial_replay_size, train_frequency,
             max_replay_size, fit_params=None, approximator_params=None,
             n_approximators=1, history_length=1, clip_reward=True,
             max_no_op_actions=0, no_op_action_value=0, p_mask=2 / 3.,
             dtype=np.float32, weighted_update=False):
    """
    Constructor.

    Args:
        approximator (object): the approximator class handed to
            ``Regressor`` for both the train and the target network;
        policy (object): the policy; its ``set_q`` method is called with
            the train approximator;
        mdp_info (object): passed to ``ReplayMemory`` and to the parent
            constructor;
        batch_size (int): stored in ``_batch_size``;
        target_update_frequency (int): integer-divided by
            ``train_frequency`` before being stored;
        initial_replay_size (int): passed to ``ReplayMemory``;
        train_frequency (int): divisor applied to
            ``target_update_frequency``;
        max_replay_size (int): passed to ``ReplayMemory``;
        fit_params (dict, None): stored in ``_fit_params``; ``None`` is
            replaced by an empty dict;
        approximator_params (dict, None): keyword arguments for the two
            regressors; ``None`` is treated as an empty dict;
        n_approximators (int, 1): stored and passed to ``ReplayMemory``;
        history_length (int, 1): passed to ``ReplayMemory`` and
            ``Buffer``;
        clip_reward (bool, True): stored in ``_clip_reward``;
        max_no_op_actions (int, 0): stored in ``_max_no_op_actions``;
        no_op_action_value (int, 0): stored in ``_no_op_action_value``;
        p_mask (float, 2 / 3.): stored in ``_p_mask``;
        dtype (object, np.float32): passed to ``ReplayMemory`` and
            ``Buffer``;
        weighted_update (bool, False): stored as is.

    """
    self._fit_params = dict() if fit_params is None else fit_params

    self._batch_size = batch_size
    self._n_approximators = n_approximators
    self._clip_reward = clip_reward
    self._target_update_frequency = (target_update_frequency
                                     // train_frequency)
    self._max_no_op_actions = max_no_op_actions
    self._no_op_action_value = no_op_action_value
    self._p_mask = p_mask
    self.weighted_update = weighted_update

    self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                       max_replay_size, history_length,
                                       n_approximators, dtype)
    self._buffer = Buffer(history_length, dtype)

    self._n_updates = 0
    self._episode_steps = 0
    self._no_op_actions = None

    # The signature advertises None as a valid default: treat it as
    # "no extra parameters" instead of failing on None['name'].
    if approximator_params is None:
        approximator_params = dict()

    apprx_params_train = deepcopy(approximator_params)
    apprx_params_train['name'] = 'train'
    apprx_params_target = deepcopy(approximator_params)
    apprx_params_target['name'] = 'target'
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    # Start with the target network equal to the train network.
    self.target_approximator.model.set_weights(
        self.approximator.model.get_weights())

    super(DQN, self).__init__(policy, mdp_info)
def test_multivariate_gaussian():
    """
    Compare the analytical gradient of ``GaussianPolicy`` with the value
    returned by ``numerical_diff_policy`` on 20 random states, using a full
    3x3 ``sigma`` matrix.

    """
    np.random.seed(88)

    state_dim = 5
    action_dim = 3

    # A.dot(A.T) is symmetric and positive semi-definite.
    factor = np.random.rand(action_dim, action_dim)
    sigma = factor.dot(factor.T)

    approximator = Regressor(LinearApproximator,
                             input_shape=(state_dim, ),
                             output_shape=(action_dim, ))
    pi = GaussianPolicy(approximator, sigma)
    pi.set_weights(np.random.rand(pi.weights_size))

    for sample in np.random.randn(20, state_dim):
        state = np.atleast_1d(sample)
        action = pi.draw_action(state)

        analytical = pi.diff(state, action)
        numerical = numerical_diff_policy(pi, state, action)

        assert np.allclose(analytical, numerical)
def univariate_gaussian():
    """
    Compare the analytical gradient of a one-output ``GaussianPolicy`` with
    the value returned by ``numerical_diff_policy`` on 20 random states.

    The random generator is not seeded here.

    """
    print('Testing univariate gaussian policy...')

    state_dim = 5
    sigma = 1e-3*np.eye(1)

    approximator = Regressor(LinearApproximator,
                             input_shape=(state_dim,),
                             output_shape=(1,),
                             params=dict(input_dim=state_dim))
    pi = GaussianPolicy(approximator, sigma)
    pi.set_weights(np.random.rand(pi.weights_size))

    for sample in np.random.randn(20, state_dim):
        state = np.atleast_1d(sample)
        action = pi.draw_action(state)

        analytical = pi.diff(state, action)
        numerical = numerical_diff_policy(pi, state, action)

        assert np.allclose(analytical, numerical)
def build_low_level_ghavamzadeh(alg, params, mdp):
    """
    Build an agent using tile-coding features and a diagonal Gaussian
    policy with a linear mean.

    Args:
        alg (callable): called as
            ``alg(pi, mdp_info, features=..., **params)`` to build the
            agent;
        params (dict): extra keyword arguments for ``alg``;
        mdp (object): environment; its ``info.action_space`` is reused for
            the agent.

    Returns:
        The object returned by ``alg``.

    """
    # FeaturesL
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 10]
    # ``np.float`` was an alias of the builtin ``float`` and no longer
    # exists in current NumPy; the builtin yields the same float64 dtype.
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 3

    tilingsL = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                              high=high)

    featuresL = Features(tilings=tilingsL)

    mdp_info_agentL = MDPInfo(
        observation_space=spaces.Box(low=np.array([0, 0]),
                                     high=np.array([150, 150]),
                                     shape=(2, )),
        action_space=mdp.info.action_space,
        gamma=0.99,
        horizon=10000)

    input_shape = (featuresL.size, )
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    std = np.array([3e-2])
    pi = DiagonalGaussianPolicy(mu=approximator, std=std)

    agent = alg(pi, mdp_info_agentL, features=featuresL, **params)

    return agent
def experiment(alg, params, n_epochs, fit_per_run, ep_per_run):
    """
    Train ``alg`` on a one-dimensional LQR and print the mean J after every
    epoch.

    Args:
        alg (callable): called as
            ``alg(distribution, policy, mdp.info, **params)``;
        params (dict): extra keyword arguments for ``alg``;
        n_epochs (int): number of learn/evaluate rounds;
        fit_per_run (int): multiplied by ``ep_per_run`` to get the number
            of learning episodes per epoch;
        ep_per_run (int): episodes per fit and per evaluation.

    """
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    # Deterministic linear policy whose weights are drawn from a Gaussian
    # distribution
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape)
    policy = DeterministicPolicy(mu=approximator)

    n_weights = policy.weights_size
    distribution = GaussianCholeskyDistribution(np.zeros(n_weights),
                                                1e-3 * np.eye(n_weights))

    # Agent
    agent = alg(distribution, policy, mdp.info, **params)

    # Train
    core = Core(agent, mdp)

    def evaluate_and_report(prefix):
        # Evaluate, then print the distribution parameters and the mean J.
        dataset = core.evaluate(n_episodes=ep_per_run)
        print('distribution parameters: ', distribution.get_parameters())
        returns = compute_J(dataset, gamma=mdp.info.gamma)
        print(prefix + str(np.mean(returns)))

    evaluate_and_report('J at start : ')

    for epoch in range(n_epochs):
        core.learn(n_episodes=fit_per_run * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        evaluate_and_report('J at iteration ' + str(epoch) + ': ')
def build_high_level_agent(alg, params, mdp, mu, sigma):
    """
    Build an agent with a diagonal Gaussian policy whose mean is a linear
    regressor over a single ``PolynomialBasis`` feature.

    Args:
        alg (callable): called as
            ``alg(pi1, mdp_info_agent, features=..., **params)``;
        params (dict): extra keyword arguments for ``alg``;
        mdp (object): environment providing the observation space;
        mu (np.ndarray): weights set on the mean regressor;
        sigma (np.ndarray): ``std`` argument of the policy.

    Returns:
        The object returned by ``alg``.

    """
    features = Features(basis_list=[PolynomialBasis()])

    mean_regressor = Regressor(LinearApproximator,
                               input_shape=(features.size, ),
                               output_shape=(2, ))
    mean_regressor.set_weights(mu)
    pi1 = DiagonalGaussianPolicy(mu=mean_regressor, std=sigma)

    # Actions are bounded by the upper limit of the first observation
    # dimension.
    upper = mdp.info.observation_space.high[0]
    mdp_info_agent = MDPInfo(
        observation_space=mdp.info.observation_space,
        action_space=spaces.Box(0, upper, (2, )),
        gamma=1.0,
        horizon=100)

    return alg(pi1, mdp_info_agent, features=features, **params)
def __init__(self, approximator, policy, mdp_info, batch_size,
             target_update_frequency, initial_replay_size, max_replay_size,
             fit_params=None, approximator_params=None, clip_reward=True,
             update_type='weighted', delta=0.1, store_prob=False, q_max=100,
             max_spread=None):
    """
    Constructor.

    Args:
        approximator (object): the approximator class handed to
            ``Regressor`` for both the train and the target network;
        policy (object): the policy; its ``set_q`` method is called with
            the train approximator;
        mdp_info (object): forwarded to the parent constructor;
        batch_size (int): stored in ``_batch_size``;
        target_update_frequency (int): stored in
            ``_target_update_frequency``;
        initial_replay_size (int): first argument of ``ReplayMemory``;
        max_replay_size (int): second argument of ``ReplayMemory``;
        fit_params (dict, None): stored in ``_fit_params``; ``None`` is
            replaced by an empty dict;
        approximator_params (dict, None): keyword arguments for the two
            regressors; ``None`` is treated as an empty dict;
        clip_reward (bool, True): stored in ``_clip_reward``;
        update_type (str, 'weighted'): stored as is;
        delta (float, 0.1): ``standard_bound`` is the standard normal
            quantile at ``1 - delta``;
        store_prob (bool, False): stored as is;
        q_max (float, 100): stored as is;
        max_spread (float, None): stored as is.

    """
    self._fit_params = dict() if fit_params is None else fit_params

    self._batch_size = batch_size
    self._clip_reward = clip_reward
    self._target_update_frequency = target_update_frequency
    self.update_type = update_type
    self.delta = delta
    self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1)
    self.store_prob = store_prob
    self.q_max = q_max
    self.max_spread = max_spread

    self._replay_memory = ReplayMemory(initial_replay_size,
                                       max_replay_size)

    self._n_updates = 0
    self._epsilon = 1e-7

    # The signature advertises None as a valid default: treat it as
    # "no extra parameters" instead of failing on None['name'].
    if approximator_params is None:
        approximator_params = dict()

    apprx_params_train = deepcopy(approximator_params)
    apprx_params_train['name'] = 'train'
    apprx_params_target = deepcopy(approximator_params)
    apprx_params_target['name'] = 'target'
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    # Start with the target network equal to the train network.
    self.target_approximator.model.set_weights(
        self.approximator.model.get_weights())

    super(GaussianDQN, self).__init__(policy, mdp_info)
def experiment(alg, params, experiment_params, subdir, i):
    """
    Train ``alg`` on the small ship steering task with tile-coding features
    and save the evaluation datasets and the collected policy parameters.

    NOTE(review): ``ep_per_run``, ``n_runs`` and ``n_iterations`` are not
    defined in this function and must exist at module level, while
    ``experiment_params`` is never read. Confirm whether those values were
    meant to come from ``experiment_params``.

    Args:
        alg (callable): called as
            ``alg(policy, mdp.info, features=phi, **params)``;
        params (dict): extra keyword arguments for ``alg``;
        experiment_params (object): currently unused;
        subdir (str): prefix of the output directory;
        i (int): experiment index appended to ``subdir``.

    """
    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    # ``np.float`` was an alias of the builtin ``float`` and no longer
    # exists in current NumPy; the builtin yields the same float64 dtype.
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size,)

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    std = np.array([3e-2])
    policy = DiagonalGaussianPolicy(mu=approximator, std=std)

    # Agent
    agent = alg(policy, mdp.info, features=phi, **params)

    # Train
    parameter_dataset = CollectPolicyParameter(policy)
    core = Core(agent, mdp, callbacks=[parameter_dataset])

    dataset_eval = list()
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    dataset_eval += dataset_eval_run
    print('J at start : ' + str(np.mean(J)))

    for n in range(n_runs):
        print('ITERATION    :', n)
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(i))
    np.save(subdir+str(i)+'/dataset_eval_file', dataset_eval)
    # Save the values gathered by the callback rather than the callback
    # object itself.
    np.save(subdir+str(i)+'/parameter_dataset_file',
            parameter_dataset.get_values())
def experiment(alg, n_runs, n_iterations, ep_per_run, use_tensorflow):
    """
    Train ``alg`` on ship steering with Gaussian features, print the mean J
    after every run and save the last evaluation dataset to
    ``ship_steering.npy``.

    Args:
        alg (callable): called as
            ``alg(policy, mdp.info, agent_params, phi)``;
        n_runs (int): number of learn/evaluate rounds;
        n_iterations (int): multiplied by ``ep_per_run`` to get the number
            of learning episodes per run;
        ep_per_run (int): episodes per fit and per evaluation;
        use_tensorflow (bool): build the features from
            ``gaussian_tensor`` instead of ``GaussianRBF``.

    """
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    if use_tensorflow:
        tensor_list = gaussian_tensor.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])

        phi = Features(tensor_list=tensor_list, name='phi',
                       input_dim=mdp.info.observation_space.shape[0])
    else:
        basis = GaussianRBF.generate(
            [3, 3, 6, 2],
            [[0., 150.], [0., 150.], [-np.pi, np.pi],
             [-np.pi / 12, np.pi / 12]])

        phi = Features(basis_list=basis)

    input_shape = (phi.size, )

    approximator_params = dict(input_dim=phi.size)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = np.array([[.05]])
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = alg(policy, mdp.info, agent_params, phi)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    # ``range`` works on both Python 2 and Python 3; ``xrange`` does not
    # exist on Python 3.
    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def experiment(alg, params, subdir, exp_no):
    """
    Train ``alg`` on the small ship steering task with a deterministic
    linear policy whose weights follow a diagonal Gaussian distribution,
    then save the evaluation datasets.

    NOTE(review): ``ep_per_run``, ``n_runs`` and ``n_iterations`` are not
    defined in this function and must exist at module level.

    Args:
        alg (callable): called as
            ``alg(distribution, policy, mdp.info, features=phi, **params)``;
        params (dict): extra keyword arguments for ``alg``;
        subdir (str): prefix of the output directory;
        exp_no (int): experiment index appended to ``subdir``.

    """
    np.random.seed()

    # MDP
    mdp = ShipSteering(small=True, n_steps_action=3)

    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    # ``np.float`` was an alias of the builtin ``float`` and no longer
    # exists in current NumPy; the builtin yields the same float64 dtype.
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator_params = dict(input_dim=input_shape)
    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    policy = DeterministicPolicy(mu=approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    dataset_eval = list()
    core = Core(agent, mdp)
    dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval_run = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        dataset_eval += dataset_eval_run

    mk_dir_recursive('./' + subdir + str(exp_no))
    np.save(subdir + str(exp_no) + '/dataset_eval_file', dataset_eval)
def experiment(alg, n_epochs, n_iterations, ep_per_run):
    """
    Train ``alg`` on a one-dimensional LQR with a
    ``StateStdGaussianPolicy`` and print the policy weights and the mean J
    after every epoch.

    Args:
        alg (callable): called as
            ``alg(policy, mdp.info, learning_rate=...)``;
        n_epochs (int): number of learn/evaluate rounds;
        n_iterations (int): multiplied by ``ep_per_run`` to get the number
            of learning episodes per epoch;
        ep_per_run (int): episodes per fit and per evaluation.

    """
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)
    obs_shape = mdp.info.observation_space.shape
    act_shape = mdp.info.action_space.shape

    # Both regressors share the same shapes and parameters.
    approximator_params = dict(input_dim=obs_shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=obs_shape,
                             output_shape=act_shape,
                             params=approximator_params)
    sigma = Regressor(LinearApproximator,
                      input_shape=obs_shape,
                      output_shape=act_shape,
                      params=approximator_params)
    sigma.set_weights(2 * np.ones(sigma.weights_size))

    policy = StateStdGaussianPolicy(approximator, sigma)

    # Agent
    algorithm_params = dict(learning_rate=AdaptiveParameter(value=.01))
    agent = alg(policy, mdp.info, **algorithm_params)

    # Train
    core = Core(agent, mdp)

    def evaluate_and_report(prefix):
        # Evaluate, then print the policy weights and the mean J.
        dataset = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters: ', policy.get_weights())
        returns = compute_J(dataset, gamma=mdp.info.gamma)
        print(prefix + str(np.mean(returns)))

    evaluate_and_report('J at start : ')

    for epoch in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        evaluate_and_report('J at iteration ' + str(epoch) + ': ')
def __init__(self, approximator, policy, mdp_info, batch_size,
             target_update_frequency, initial_replay_size, max_replay_size,
             fit_params=None, approximator_params=None, n_approximators=1,
             clip_reward=True, p_mask=2 / 3.):
    """
    Constructor.

    Args:
        approximator (object): the approximator class handed to
            ``Regressor`` for both the train and the target network;
        policy (object): the policy; its ``set_q`` method is called with
            the train approximator;
        mdp_info (object): forwarded to the parent constructor;
        batch_size (int): stored in ``_batch_size``;
        target_update_frequency (int): stored in
            ``_target_update_frequency``;
        initial_replay_size (int): first argument of ``ReplayMemory``;
        max_replay_size (int): second argument of ``ReplayMemory``;
        fit_params (dict, None): stored in ``_fit_params``; ``None`` is
            replaced by an empty dict;
        approximator_params (dict, None): keyword arguments for the two
            regressors; ``None`` is treated as an empty dict;
        n_approximators (int, 1): stored in ``_n_approximators``;
        clip_reward (bool, True): stored in ``_clip_reward``;
        p_mask (float, 2 / 3.): stored in ``_p_mask``.

    """
    self._fit_params = dict() if fit_params is None else fit_params

    self._batch_size = batch_size
    self._n_approximators = n_approximators
    self._clip_reward = clip_reward
    self._target_update_frequency = target_update_frequency
    self._p_mask = p_mask

    self._replay_memory = ReplayMemory(initial_replay_size,
                                       max_replay_size)

    self._n_updates = 0
    self._episode_steps = 0

    # The signature advertises None as a valid default: treat it as
    # "no extra parameters" instead of failing on None['name'].
    if approximator_params is None:
        approximator_params = dict()

    apprx_params_train = deepcopy(approximator_params)
    apprx_params_train['name'] = 'train'
    apprx_params_target = deepcopy(approximator_params)
    apprx_params_target['name'] = 'target'
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    # Start with the target network equal to the train network.
    self.target_approximator.model.set_weights(
        self.approximator.model.get_weights())

    super(BootstrappedDQN, self).__init__(policy, mdp_info)
def __init__(self, approximator, policy, mdp_info, params):
    """
    Constructor.

    Args:
        approximator (object): the approximator class handed to
            ``Regressor`` for both the train and the target network;
        policy (object): the policy; its ``set_q`` method is called with
            the train approximator;
        mdp_info (object): passed to ``ReplayMemory`` and to the parent
            constructor;
        params (dict): must contain ``'algorithm_params'`` (read for
            ``batch_size``, ``n_approximators``, ``clip_reward``,
            ``train_frequency``, ``target_update_frequency``,
            ``max_no_op_actions``, ``no_op_action_value``,
            ``initial_replay_size``, ``max_replay_size`` and
            ``history_length``) and ``'approximator_params'`` (keyword
            arguments for the regressors).

    """
    alg_params = params['algorithm_params']
    self._batch_size = alg_params.get('batch_size')
    self._n_approximators = alg_params.get('n_approximators', 1)
    self._clip_reward = alg_params.get('clip_reward', True)
    self._train_frequency = alg_params.get('train_frequency')
    self._target_update_frequency = alg_params.get(
        'target_update_frequency')
    self._max_no_op_actions = alg_params.get('max_no_op_actions', 0)
    self._no_op_action_value = alg_params.get('no_op_action_value', 0)

    self._replay_memory = ReplayMemory(
        mdp_info,
        alg_params.get('initial_replay_size'),
        alg_params.get('max_replay_size'),
        alg_params.get('history_length', 1))
    self._buffer = Buffer(size=alg_params.get('history_length', 1))

    self._n_updates = 0
    self._episode_steps = 0
    self._no_op_actions = None

    apprx_params_train = deepcopy(params['approximator_params'])
    apprx_params_train['name'] = 'train'
    apprx_params_target = deepcopy(params['approximator_params'])
    apprx_params_target['name'] = 'target'
    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         n_models=self._n_approximators,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    # Copy the train weights into the target model(s).
    if self._n_approximators == 1:
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())
    else:
        # ``range`` works on both Python 2 and Python 3; ``xrange`` does
        # not exist on Python 3.
        for i in range(self._n_approximators):
            self.target_approximator.model[i].set_weights(
                self.approximator.model.get_weights())

    super(DQN, self).__init__(policy, mdp_info, params)
def test_linear_approximator():
    """
    Fit a linear regressor on noisy linear data and check both the
    recovered weights and the layout of the derivative returned by
    ``diff``.

    """
    np.random.seed(88)

    noise = 1e-3

    inputs = np.random.rand(1000, 3)
    true_k = np.random.rand(3, 2)
    targets = inputs.dot(true_k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             output_shape=(2, ))
    approximator.fit(inputs, targets)

    # The fitted weights are compared with the flattened transpose of the
    # true matrix.
    weight_error = approximator.get_weights() - true_k.T.flatten()
    assert np.linalg.norm(weight_error) < noise

    point = np.random.randn(3, )
    derivative = approximator.diff(point)

    # Column ``col`` must contain ``point`` in rows
    # [col * n_inputs, (col + 1) * n_inputs).
    n_inputs = len(point)
    for col in range(derivative.shape[1]):
        rows = slice(col * n_inputs, (col + 1) * n_inputs)
        assert (derivative[rows, col] == point).all()
def test_pytorch_approximator():
    """
    Fit a ``PyTorchApproximator`` on a noisy sinusoidal target and require
    the Frobenius error divided by the number of samples to stay below
    2e-4.

    """
    np.random.seed(88)
    torch.manual_seed(88)

    n_samples = 1000
    noise = 1e-3**2

    inputs = np.random.rand(n_samples, 4)
    mixing = np.random.rand(4, 2)
    targets = np.sin(inputs).dot(mixing) + np.random.randn(n_samples,
                                                           2) * noise

    optimizer = {'class': optim.Adam, 'params': {}}
    approximator = Regressor(PyTorchApproximator,
                             input_shape=(4, ),
                             output_shape=(2, ),
                             network=ExampleNet,
                             optimizer=optimizer,
                             loss=F.mse_loss,
                             n_neurons=100,
                             n_hidden=1,
                             n_epochs=200,
                             batch_size=100,
                             quiet=True)

    approximator.fit(inputs, targets)
    predictions = approximator.predict(inputs)

    residual = targets - predictions
    error = np.linalg.norm(residual, 'fro') / n_samples
    error_inf = np.max(np.abs(residual))

    # Diagnostics only; the assertion uses the scaled Frobenius error.
    print(targets[:10])
    print(predictions[:10])
    print(error_inf)

    assert error < 2e-4
def build_mid_level_agent(alg, params, mdp, mu, std):
    """
    Build an agent with a two-output diagonal Gaussian policy whose mean
    regressor has all weights set to ``mu``.

    Args:
        alg (callable): called as ``alg(policy=..., mdp_info=...,
            features=..., **params)``;
        params (dict): extra keyword arguments for ``alg``;
        mdp (object): environment; the upper limit of its first
            observation dimension bounds the action space;
        mu (float): value given to every weight of the mean regressor;
        std (float): value given to both policy standard deviations.

    Returns:
        The object returned by ``alg``.

    """
    mean_regressor = Regressor(LinearApproximator,
                               input_shape=(1, ),
                               output_shape=(2, ))
    mean_regressor.set_weights(mu * np.ones(mean_regressor.weights_size))

    pi = DiagonalGaussianPolicy(mu=mean_regressor, std=std * np.ones(2))

    upper = mdp.info.observation_space.high[0]

    features = BasisFeatures(basis=[PolynomialBasis()])

    mdp_info_agent1 = MDPInfo(
        observation_space=spaces.Box(0, 1, (1, )),
        action_space=spaces.Box(0, upper, (2, )),
        gamma=1,
        horizon=10)

    return alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
               **params)
def experiment(alg, params, n_epochs, n_iterations, ep_per_run):
    """
    Train ``alg`` on ship steering with tile-coding features and print the
    mean J after every epoch.

    Args:
        alg (callable): called as
            ``alg(distribution, policy, mdp.info, features=phi, **params)``;
        params (dict): extra keyword arguments for ``alg``;
        n_epochs (int): number of learn/evaluate rounds;
        n_iterations (int): multiplied by ``ep_per_run`` to get the number
            of learning episodes per epoch;
        ep_per_run (int): episodes per fit and per evaluation.

    """
    np.random.seed()

    # MDP
    mdp = ShipSteering()

    # Policy
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 6]
    # ``np.float`` was an alias of the builtin ``float`` and no longer
    # exists in current NumPy; the builtin yields the same float64 dtype.
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high)

    phi = Features(tilings=tilings)
    input_shape = (phi.size, )

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    policy = DeterministicPolicy(approximator)

    mu = np.zeros(policy.weights_size)
    sigma = 4e-1 * np.ones(policy.weights_size)
    distribution = GaussianDiagonalDistribution(mu, sigma)

    # Agent
    agent = alg(distribution, policy, mdp.info, features=phi, **params)

    # Train
    print(alg.__name__)
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    for i in range(n_epochs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))
def build_approximator(mdp):
    """
    Build uniform tile-coding features and a linear regressor on top of
    them.

    Args:
        mdp (object): environment; the shape of its action space is the
            output shape of the regressor.

    Returns:
        A ``(phi, approximator)`` tuple with the features and the
        regressor.

    """
    high = [150, 150, np.pi]
    low = [0, 0, -np.pi]
    n_tiles = [5, 5, 8]
    # ``np.float`` was an alias of the builtin ``float`` and no longer
    # exists in current NumPy; the builtin yields the same float64 dtype.
    low = np.array(low, dtype=float)
    high = np.array(high, dtype=float)
    n_tilings = 1

    tilings = Tiles.generate(n_tilings=n_tilings, n_tiles=n_tiles, low=low,
                             high=high, uniform=True)

    phi = Features(tilings=tilings)

    input_shape = (phi.size,)

    approximator = Regressor(LinearApproximator, input_shape=input_shape,
                             output_shape=mdp.info.action_space.shape)

    return phi, approximator
def build_high_level_agent(alg, params, mdp, mu, std):
    """
    Build an agent with a ``StateLogStdGaussianPolicy`` over 10x10 tile
    features of the first two observation dimensions.

    Args:
        alg (callable): called as ``alg(policy=..., mdp_info=...,
            features=..., **params)``;
        params (dict): extra keyword arguments for ``alg``;
        mdp (object): environment; its first two observation dimensions
            form the agent observation space and the bounds of the third
            form the agent action space;
        mu (float): value given to every weight of the mean regressor;
        std (float): value given to every weight of the regressor passed
            as ``log_std``.

    Returns:
        The object returned by ``alg``.

    """
    tilings = Tiles.generate(n_tilings=1, n_tiles=[10, 10],
                             low=mdp.info.observation_space.low[:2],
                             high=mdp.info.observation_space.high[:2])
    features = Features(tilings=tilings)

    input_shape = (features.size, )

    mu_approximator = Regressor(LinearApproximator, input_shape=input_shape,
                                output_shape=(1, ))
    std_approximator = Regressor(LinearApproximator,
                                 input_shape=input_shape,
                                 output_shape=(1, ))

    w_mu = mu * np.ones(mu_approximator.weights_size)
    mu_approximator.set_weights(w_mu)

    # These weights belong to the std regressor. Writing them into
    # ``mu_approximator`` would overwrite the mean weights set just above
    # and leave ``std_approximator`` at its initial weights.
    w_std = std * np.ones(std_approximator.weights_size)
    std_approximator.set_weights(w_std)

    pi = StateLogStdGaussianPolicy(mu=mu_approximator,
                                   log_std=std_approximator)

    obs_low = np.array(
        [mdp.info.observation_space.low[0],
         mdp.info.observation_space.low[1]])
    obs_high = np.array([
        mdp.info.observation_space.high[0],
        mdp.info.observation_space.high[1]
    ])
    mdp_info_agent1 = MDPInfo(
        observation_space=spaces.Box(obs_low, obs_high, shape=(2, )),
        action_space=spaces.Box(mdp.info.observation_space.low[2],
                                mdp.info.observation_space.high[2],
                                shape=(1, )),
        gamma=1,
        horizon=10)

    agent = alg(policy=pi, mdp_info=mdp_info_agent1, features=features,
                **params)

    return agent
def experiment(alg, n_runs, n_iterations, ep_per_run):
    """
    Train ``alg`` on a one-dimensional LQR with a
    ``MultivariateGaussianPolicy``, print the policy weights and the mean J
    after every run and save the last evaluation dataset.

    NOTE(review): the output file is named ``ship_steering.npy`` although
    the environment built here is an LQR; confirm the intended file name.

    Args:
        alg (callable): called as ``alg(policy, mdp.info, agent_params)``;
        n_runs (int): number of learn/evaluate rounds;
        n_iterations (int): multiplied by ``ep_per_run`` to get the number
            of learning episodes per run;
        ep_per_run (int): episodes per fit and per evaluation.

    """
    np.random.seed()

    # MDP
    mdp = LQR.generate(dimensions=1)

    approximator_params = dict(input_dim=mdp.info.observation_space.shape)
    approximator = Regressor(LinearApproximator,
                             input_shape=mdp.info.observation_space.shape,
                             output_shape=mdp.info.action_space.shape,
                             params=approximator_params)

    sigma = .1 * np.eye(1)
    policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

    # Agent
    learning_rate = AdaptiveParameter(value=.01)
    algorithm_params = dict(learning_rate=learning_rate)
    fit_params = dict()
    agent_params = {
        'algorithm_params': algorithm_params,
        'fit_params': fit_params
    }
    agent = alg(policy, mdp.info, agent_params)

    # Train
    core = Core(agent, mdp)
    dataset_eval = core.evaluate(n_episodes=ep_per_run)
    # Single-argument print call: valid on Python 2 and 3. The two spaces
    # reproduce the separator the old ``print a, b`` statement inserted.
    print('policy parameters:  ' + str(policy.get_weights()))
    J = compute_J(dataset_eval, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))

    # ``range`` works on both Python 2 and Python 3; ``xrange`` does not
    # exist on Python 3.
    for i in range(n_runs):
        core.learn(n_episodes=n_iterations * ep_per_run,
                   n_episodes_per_fit=ep_per_run)
        dataset_eval = core.evaluate(n_episodes=ep_per_run)
        print('policy parameters:  ' + str(policy.get_weights()))
        J = compute_J(dataset_eval, gamma=mdp.info.gamma)
        print('J at iteration ' + str(i) + ': ' + str(np.mean(J)))

    np.save('ship_steering.npy', dataset_eval)
def build_agent_high(alg, params, std, mdp):
    """
    Build an agent with a deterministic linear policy (one input, one
    output) whose weights follow a diagonal Gaussian distribution.

    Args:
        alg (callable): called as ``alg(dist, pi, mdp_info, **params)``;
        params (dict): extra keyword arguments for ``alg``;
        std (float): value given to every entry of the distribution sigma;
        mdp (object): environment providing the first observation
            dimension, gamma and horizon.

    Returns:
        The object returned by ``alg``.

    """
    # Features
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(1, ),
                              output_shape=(1, ))

    # Policy H
    pi = DeterministicPolicy(approximator1)

    weights_size = approximator1.weights_size
    dist = GaussianDiagonalDistribution(np.zeros(weights_size),
                                        std * np.ones(weights_size))

    # Observations: first dimension only; actions: [-pi / 2, pi / 2].
    action_limit = np.pi / 2
    obs_low = mdp.info.observation_space.low[0:1]
    obs_high = mdp.info.observation_space.high[0:1]
    mdp_info = MDPInfo(
        observation_space=spaces.Box(obs_low, obs_high),
        action_space=spaces.Box(-action_limit, action_limit, (1, )),
        gamma=mdp.info.gamma,
        horizon=mdp.info.horizon)

    return alg(dist, pi, mdp_info, **params)
def build_agent_low(alg, params, std, mdp):
    """
    Build an agent with a ``DeterministicControlPolicy`` on a linear
    regressor (three inputs, one output) whose weights follow a diagonal
    Gaussian distribution.

    Args:
        alg (callable): called as ``alg(dist, pi, mdp_info, **params)``;
        params (dict): extra keyword arguments for ``alg``;
        std (float): value given to every entry of the distribution sigma;
        mdp (object): environment providing observation bounds, action
            space, gamma and horizon.

    Returns:
        The object returned by ``alg``.

    """
    approximator = Regressor(LinearApproximator,
                             input_shape=(3, ),
                             output_shape=(1, ))
    pi = DeterministicControlPolicy(approximator)

    weights_size = approximator.weights_size
    dist = GaussianDiagonalDistribution(np.zeros(weights_size),
                                        std * np.ones(weights_size))

    # Agent Low
    observation_space = spaces.Box(
        low=mdp.info.observation_space.low[1:],  # FIXME FALSE
        high=mdp.info.observation_space.high[1:],  # FIXME FALSE
    )
    mdp_info = MDPInfo(observation_space=observation_space,
                       action_space=mdp.info.action_space,
                       gamma=mdp.info.gamma,
                       horizon=mdp.info.horizon)

    return alg(dist, pi, mdp_info, **params)
def build_low_level_agent(alg, params, mdp, horizon, std):
    """
    Build an agent with a diagonal Gaussian policy whose mean is a linear
    regressor over Fourier features of a two-dimensional observation.

    Args:
        alg (callable): called as
            ``alg(pi, mdp_info_agent, features=..., **params)``;
        params (dict): extra keyword arguments for ``alg``;
        mdp (object): environment providing observation bounds, action
            space and gamma;
        horizon (int): horizon of the agent ``MDPInfo``;
        std (np.ndarray): standard deviation passed to the policy.

    Returns:
        The object returned by ``alg``.

    """
    # Upper bound of the second observation: norm of the span of the first
    # two dimensions of the environment observation space.
    span = (mdp.info.observation_space.high[:2]
            - mdp.info.observation_space.low[:2])
    rho_max = np.linalg.norm(span)

    obs_low = np.array([-np.pi, 0])
    obs_high = np.array([np.pi, rho_max])

    features = Features(
        basis_list=FourierBasis.generate(obs_low, obs_high, 10))

    approximator = Regressor(LinearApproximator,
                             input_shape=(features.size, ),
                             output_shape=mdp.info.action_space.shape)
    pi = DiagonalGaussianPolicy(approximator, std)

    mdp_info_agent = MDPInfo(
        observation_space=spaces.Box(obs_low, obs_high),
        action_space=mdp.info.action_space,
        gamma=mdp.info.gamma,
        horizon=horizon)

    return alg(pi, mdp_info_agent, features=features, **params)
# Environment
mdp = TurtlebotGazebo()

# Policy: Gaussian tensor features feeding a linear mean regressor
tensor_list = gaussian_tensor.generate(
    [10, 10, 6],
    [[-5.0, 5.0], [-5.0, 5.0], [-np.pi, np.pi]])

phi = Features(tensor_list=tensor_list, name='phi',
               input_dim=mdp.info.observation_space.shape[0])

input_shape = (phi.size, )

approximator_params = dict(input_dim=phi.size)
approximator = Regressor(LinearApproximator, input_shape=input_shape,
                         output_shape=mdp.info.action_space.shape,
                         params=approximator_params)

sigma = np.eye(2) * 1e-1
policy = MultivariateGaussianPolicy(mu=approximator, sigma=sigma)

# Agent
learning_rate = AdaptiveParameter(value=5)
algorithm_params = dict(learning_rate=learning_rate)
fit_params = dict()
agent_params = {'algorithm_params': algorithm_params,
                'fit_params': fit_params}
agent = REINFORCE(policy, mdp.info, agent_params, phi)

# Train
core = Core(agent, mdp)
# Single-argument print call: same output on Python 2, valid on Python 3.
print('Initial evaluation')
def server_experiment_small(alg_high, alg_low, params, subdir, i):
    """
    Build a two-level computational graph on ship steering, train it and
    save the collected datasets and parameters under ``subdir + str(i)``.

    Control block 1 wraps the agent built by ``alg_high`` and control block
    2 wraps the agent built by ``alg_low``. The output of block 1 goes
    through function block 1 before reaching block 2, and function block 2
    provides the reward of block 2.

    NOTE(review): ``ep_per_run``, ``eval_run``, ``n_runs`` and
    ``n_iterations`` are not defined in this function and must exist at
    module level.

    Args:
        alg_high (callable): builds agent 1; called with ``policy``,
            ``mdp_info``, ``learning_rate`` and ``features`` keywords;
        alg_low (callable): builds agent 2; called with ``distribution``,
            ``policy``, ``mdp_info`` and ``learning_rate`` keywords;
        params (dict): read for ``'learning_rate_high'`` and
            ``'learning_rate_low'``;
        subdir (str): prefix of the output directory;
        i (int): experiment index appended to ``subdir``.

    """
    np.random.seed()

    # Model Block
    mdp = ShipSteering(small=False, n_steps_action=3)

    # State Placeholder
    state_ph = PlaceHolder(name='state_ph')

    # Reward Placeholder
    reward_ph = PlaceHolder(name='reward_ph')

    # Last_In Placeholder
    lastaction_ph = PlaceHolder(name='lastaction_ph')

    # Function Block 1
    function_block1 = fBlock(name='f1 (angle difference)',
                             phi=pos_ref_angle_difference)

    # Function Block 2
    function_block2 = fBlock(name='f2 (cost cosine)', phi=cost_cosine)

    # Features
    features = Features(basis_list=[PolynomialBasis()])

    # Policy 1
    sigma1 = np.array([255, 255])
    approximator1 = Regressor(LinearApproximator,
                              input_shape=(features.size, ),
                              output_shape=(2, ))
    approximator1.set_weights(np.array([500, 500]))

    pi1 = DiagonalGaussianPolicy(mu=approximator1, std=sigma1)

    # Policy 2
    pi2 = DeterministicControlPolicy(weights=np.array([0]))
    mu2 = np.zeros(pi2.weights_size)
    sigma2 = 1e-3 * np.ones(pi2.weights_size)
    distribution2 = GaussianDiagonalDistribution(mu2, sigma2)

    # Agent 1
    learning_rate1 = params.get('learning_rate_high')
    lim = 1000
    mdp_info_agent1 = MDPInfo(observation_space=mdp.info.observation_space,
                              action_space=spaces.Box(0, lim, (2, )),
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent1 = alg_high(policy=pi1,
                      mdp_info=mdp_info_agent1,
                      learning_rate=learning_rate1,
                      features=features)

    # Agent 2
    learning_rate2 = params.get('learning_rate_low')
    mdp_info_agent2 = MDPInfo(observation_space=spaces.Box(
        -np.pi, np.pi, (1, )),
                              action_space=mdp.info.action_space,
                              gamma=mdp.info.gamma,
                              horizon=100)
    agent2 = alg_low(distribution=distribution2,
                     policy=pi2,
                     mdp_info=mdp_info_agent2,
                     learning_rate=learning_rate2)

    # Control Block 1
    parameter_callback1 = CollectPolicyParameter(pi1)
    control_block1 = ControlBlock(name='Control Block 1',
                                  agent=agent1,
                                  n_eps_per_fit=ep_per_run,
                                  callbacks=[parameter_callback1])

    # Control Block 2
    parameter_callback2 = CollectDistributionParameter(distribution2)
    control_block2 = ControlBlock(name='Control Block 2',
                                  agent=agent2,
                                  n_eps_per_fit=10,
                                  callbacks=[parameter_callback2])

    # Reward Accumulator
    reward_acc = reward_accumulator_block(gamma=mdp_info_agent1.gamma,
                                          name='reward_acc')

    # Algorithm
    blocks = [
        state_ph, reward_ph, lastaction_ph, control_block1, control_block2,
        function_block1, function_block2, reward_acc
    ]

    # Graph wiring; the order of the add_* calls is kept exactly as in the
    # original.
    state_ph.add_input(control_block2)
    reward_ph.add_input(control_block2)
    lastaction_ph.add_input(control_block2)
    control_block1.add_input(state_ph)
    reward_acc.add_input(reward_ph)
    reward_acc.add_alarm_connection(control_block2)
    control_block1.add_reward(reward_acc)
    control_block1.add_alarm_connection(control_block2)
    function_block1.add_input(control_block1)
    function_block1.add_input(state_ph)
    function_block2.add_input(function_block1)
    control_block2.add_input(function_block1)
    control_block2.add_reward(function_block2)
    computational_graph = ComputationalGraph(blocks=blocks, model=mdp)
    core = HierarchicalCore(computational_graph)

    # Train
    low_level_dataset_eval = list()
    dataset_eval = list()

    dataset_eval_run = core.evaluate(n_episodes=eval_run)
    J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
    print('J at start : ' + str(np.mean(J)))
    dataset_eval += dataset_eval_run

    for n in range(n_runs):
        print('ITERATION', n)
        core.learn(n_episodes=n_iterations * ep_per_run, skip=True)
        dataset_eval_run = core.evaluate(n_episodes=eval_run)
        dataset_eval += dataset_eval_run
        J = compute_J(dataset_eval_run, gamma=mdp.info.gamma)
        print('J at iteration ' + str(n) + ': ' + str(np.mean(J)))
        # NOTE(review): assumed to run once per iteration; the original
        # indentation was lost, confirm against the source file.
        low_level_dataset_eval += control_block2.dataset.get()

    # Save
    parameter_dataset1 = parameter_callback1.get_values()
    parameter_dataset2 = parameter_callback2.get_values()

    mk_dir_recursive('./' + subdir + str(i))

    np.save(subdir + str(i) + '/low_level_dataset_file',
            low_level_dataset_eval)
    np.save(subdir + str(i) + '/parameter_dataset1_file',
            parameter_dataset1)
    np.save(subdir + str(i) + '/parameter_dataset2_file',
            parameter_dataset2)
    np.save(subdir + str(i) + '/dataset_eval_file', dataset_eval)
def __init__(self, approximator, policy, mdp_info, batch_size,
             initial_replay_size, max_replay_size,
             target_update_frequency=2500, fit_params=None,
             approximator_params=None, n_approximators=1,
             history_length=1, clip_reward=True, max_no_op_actions=0,
             no_op_action_value=0, dtype=np.float32):
    """
    Constructor.

    Args:
        approximator (object): the approximator class handed to
            ``Regressor`` to build both the train and the target
            approximators;
        policy (object): the policy of the agent; the train approximator
            is set as its Q-function through ``policy.set_q``;
        mdp_info (object): information about the MDP, forwarded to the
            replay memory and to the parent constructor;
        batch_size (int): the number of samples in a batch;
        initial_replay_size (int): the number of samples to collect before
            starting the learning;
        max_replay_size (int): the maximum number of samples in the replay
            memory;
        target_update_frequency (int): the number of fit updates between
            each update of the target network;
        fit_params (dict, None): parameters of the fitting algorithm of the
            approximator;
        approximator_params (dict, None): parameters of the approximator to
            build; ``None`` is treated as an empty dictionary;
        n_approximators (int, 1): the number of approximators to use in
            ``AverageDQN``;
        history_length (int, 1): the number of samples composing a state;
        clip_reward (bool, True): whether to clip the reward or not;
        max_no_op_actions (int, 0): maximum number of no-op actions that
            can be sampled;
        no_op_action_value (int, 0): value of the no-op action;
        dtype (object, np.float32): dtype of the state array.

    """
    self._fit_params = dict() if fit_params is None else fit_params

    self._batch_size = batch_size
    self._n_approximators = n_approximators
    self._clip_reward = clip_reward
    self._target_update_frequency = target_update_frequency
    self._max_no_op_actions = max_no_op_actions
    self._no_op_action_value = no_op_action_value

    self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                       max_replay_size, history_length,
                                       dtype)
    self._buffer = Buffer(history_length, dtype)

    self._n_updates = 0
    self._episode_steps = 0
    self._no_op_actions = None

    # The default ``approximator_params=None`` cannot be indexed: fall
    # back to an empty dict so that the 'name' entries can be assigned.
    if approximator_params is None:
        approximator_params = dict()

    # Independent copies, so that the caller's dict is never mutated and
    # the two approximators do not share parameter objects.
    apprx_params_train = deepcopy(approximator_params)
    apprx_params_train['name'] = 'train'
    apprx_params_target = deepcopy(approximator_params)
    apprx_params_target['name'] = 'target'

    self.approximator = Regressor(approximator, **apprx_params_train)
    self.target_approximator = Regressor(approximator,
                                         n_models=self._n_approximators,
                                         **apprx_params_target)
    policy.set_q(self.approximator)

    # Start with every target model holding the train model's weights.
    if self._n_approximators == 1:
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())
    else:
        for i in range(self._n_approximators):
            self.target_approximator.model[i].set_weights(
                self.approximator.model.get_weights())

    super(DQN, self).__init__(policy, mdp_info)
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al.. 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            approximator (object): the approximator class handed to
                ``Regressor`` to build both the train and the target
                approximators;
            policy (object): the policy of the agent; the train
                approximator is set as its Q-function through
                ``policy.set_q``;
            mdp_info (object): information about the MDP, forwarded to the
                replay memory and to the parent constructor;
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect
                before starting the learning;
            max_replay_size (int): the maximum number of samples in the
                replay memory;
            target_update_frequency (int): the number of fit updates
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of
                the approximator;
            approximator_params (dict, None): parameters of the
                approximator to build; ``None`` is treated as an empty
                dictionary;
            n_approximators (int, 1): the number of approximators to use in
                ``AverageDQN``;
            history_length (int, 1): the number of samples composing a
                state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions
                that can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        # Set by ``episode_start``; stays None until the first episode.
        self._no_op_actions = None

        # The default ``approximator_params=None`` cannot be indexed: fall
        # back to an empty dict so that the 'name' entries can be assigned.
        if approximator_params is None:
            approximator_params = dict()

        # Independent copies, so that the caller's dict is never mutated
        # and the two approximators do not share parameter objects.
        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'

        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        # Start with every target model holding the train model's weights.
        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        """
        Store the dataset in the replay memory and, once the replay memory
        reports to be initialized, perform one update of the approximator
        on a sampled batch.

        Args:
            dataset (list): the samples to add to the replay memory.

        """
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            # One-step target: r + gamma * max_a Q_target(s', a).
            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        # NOTE(review): this accesses ``model`` as a single model, whereas
        # the constructor indexes ``model[i]`` when ``n_approximators`` is
        # not 1 - presumably subclasses using several approximators
        # override this method; confirm.
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            # Zero the action-values of the absorbing states, in place.
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        """
        Add the state to the history buffer and select the action.

        During the first ``_no_op_actions`` steps of the episode the no-op
        action is returned and the policy is only updated; afterwards the
        action is drawn by the parent class on the content of the buffer.

        Args:
            state (np.ndarray): the current state.

        Returns:
            The action to execute.

        """
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        """
        Sample the number of no-op actions for the new episode and reset
        the counter of the steps of the episode.

        """
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            # ``randint`` excludes the upper bound, hence the ``+ 1``.
            # NOTE(review): the range is empty, and ``randint`` raises,
            # when ``max_no_op_actions`` is smaller than the buffer size -
            # confirm callers never configure it that way.
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)

        self._episode_steps = 0