def main(args):
    # environment
    env = MuJoCoWrapper(gym.make(args.env), args.reward_scale, args.render)
    env.seed(args.seed)
    eval_env = MuJoCoWrapper(gym.make(args.env))
    eval_env.seed(args.seed)
    num_actions = env.action_space.shape[0]

    # network parameters
    params = TD3NetworkParams(fcs=args.layers,
                              concat_index=args.concat_index,
                              state_shape=env.observation_space.shape,
                              num_actions=num_actions,
                              gamma=args.gamma,
                              tau=args.tau,
                              actor_lr=args.actor_lr,
                              critic_lr=args.critic_lr,
                              target_noise_sigma=args.target_noise_sigma,
                              target_noise_clip=args.target_noise_clip)

    # deep neural network
    network = TD3Network(params)

    # replay buffer
    buffer = Buffer(args.buffer_size)

    # metrics
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # exploration noise
    noise = NormalActionNoise(np.zeros(num_actions),
                              np.ones(num_actions) * 0.1)

    # controller
    controller = TD3Controller(network, buffer, metrics, noise, num_actions,
                               args.batch_size, args.final_steps,
                               args.log_interval, args.save_interval,
                               args.eval_interval)

    # view
    view = View(controller)

    # evaluation
    eval_controller = EvalController(network, metrics, args.eval_episode)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        if args.load is not None:
            saver.restore(sess, args.load)

        interact(env, view, eval_env, eval_view)
def main(args):
    # environments
    env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale), args.render)
    env.seed(args.seed)
    eval_env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale))
    eval_env.seed(args.seed)
    num_actions = env.action_space.shape[0]

    # network parameters
    params = PPONetworkParams(fcs=args.layers,
                              num_actions=num_actions,
                              state_shape=env.observation_space.shape,
                              num_envs=args.num_envs,
                              batch_size=args.batch_size,
                              epsilon=args.epsilon,
                              learning_rate=args.lr,
                              grad_clip=args.grad_clip,
                              value_factor=args.value_factor,
                              entropy_factor=args.entropy_factor)

    # deep neural network
    network = PPONetwork(params)

    # rollout buffer
    rollout = Rollout()

    # metrics
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # controller
    controller = PPOController(network, rollout, metrics, args.num_envs,
                               args.time_horizon, args.epoch, args.batch_size,
                               args.gamma, args.lam, args.final_steps,
                               args.log_interval, args.save_interval,
                               args.eval_interval)

    # view
    view = View(controller)

    # evaluation
    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        if args.load is not None:
            saver.restore(sess, args.load)

        interact(env, view, eval_env, eval_view, batch=True)
def test_stop_episode(self):
    controller = DummyController()
    view = View(controller)

    controller.stop_episode = MagicMock()

    self.assertEqual(view.stop_episode('obs', 'reward', 'info'), None)
    controller.stop_episode.assert_called_once_with('obs', 'reward', 'info')
def test_should_eval(self):
    controller = DummyController()
    view = View(controller)

    controller.should_eval = MagicMock(return_value=False)
    assert not view.should_eval()

    controller.should_eval = MagicMock(return_value=True)
    assert view.should_eval()
def test_step_without_update(self):
    controller = DummyController()
    view = View(controller)

    controller.should_update = MagicMock(return_value=False)
    controller.update = MagicMock(side_effect=Exception)
    controller.step = MagicMock(return_value='action')

    self.assertEqual(view.step('obs', 'reward', 'done', 'info'), 'action')
    controller.step.assert_called_once_with('obs', 'reward', 'done', 'info')
def test_step_with_save(self):
    controller = DummyController()
    view = View(controller)

    controller.should_save = MagicMock(return_value=True)
    controller.save = MagicMock(unsafe=True)
    controller.step = MagicMock(return_value='action')

    self.assertEqual(view.step('obs', 'reward', 'done', 'info'), 'action')
    controller.step.assert_called_once_with('obs', 'reward', 'done', 'info')
    controller.save.assert_called_once_with()
def test_is_finished(self):
    controller = DummyController()
    view = View(controller)

    controller.save = MagicMock()

    controller.is_finished = MagicMock(return_value=False)
    assert not view.is_finished()
    controller.save.assert_not_called()

    controller.is_finished = MagicMock(return_value=True)
    assert view.is_finished()
    controller.save.assert_called_once_with()
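# Taken together, the tests above pin down the delegation contract of View:
# step() defers to the controller for the update/save decisions and returns
# the controller's action, while is_finished() saves once training ends.
# Below is a minimal sketch consistent with these tests; the exact ordering
# of the should_update/should_save checks relative to controller.step is an
# assumption, and the real implementation in this repository may differ.
class View:
    def __init__(self, controller):
        self.controller = controller

    def step(self, obs, reward, done, info):
        # train and persist only when the controller asks for it
        if self.controller.should_update():
            self.controller.update()
        if self.controller.should_save():
            self.controller.save()
        return self.controller.step(obs, reward, done, info)

    def stop_episode(self, obs, reward, info):
        self.controller.stop_episode(obs, reward, info)

    def should_eval(self):
        return self.controller.should_eval()

    def is_finished(self):
        finished = self.controller.is_finished()
        if finished:
            # persist the final parameters before terminating
            self.controller.save()
        return finished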
def main(args):
    # environment
    env = MuJoCoWrapper(gym.make(args.env), args.reward_scale, args.render)
    eval_env = MuJoCoWrapper(gym.make(args.env))
    num_actions = env.action_space.shape[0]

    # deep neural network
    network = SACNetwork(args.layers, args.concat_index,
                         env.observation_space.shape, num_actions, args.gamma,
                         args.tau, args.pi_lr, args.q_lr, args.v_lr, args.reg)

    # replay buffer
    buffer = Buffer(args.buffer_size)

    # metrics
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # exploration noise
    noise = EmptyNoise()

    # controller
    controller = SACController(network, buffer, metrics, noise, num_actions,
                               args.batch_size, args.final_steps,
                               args.log_interval, args.save_interval,
                               args.eval_interval)

    # view
    view = View(controller)

    # evaluation
    eval_controller = EvalController(network, metrics, args.eval_episode)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        if args.load is not None:
            saver.restore(sess, args.load)

        interact(env, view, eval_env, eval_view)
def main(args):
    env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale), args.render)
    eval_env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale))
    num_actions = env.action_space.shape[0]

    network = PPONetwork(args.layers, env.observation_space.shape,
                         args.num_envs, num_actions, args.batch_size,
                         args.epsilon, args.lr, args.grad_clip,
                         args.value_factor, args.entropy_factor)

    rollout = Rollout()

    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    controller = PPOController(network, rollout, metrics, args.num_envs,
                               args.time_horizon, args.epoch, args.batch_size,
                               args.gamma, args.lam, args.final_steps,
                               args.log_interval, args.save_interval,
                               args.eval_interval)

    view = View(controller)

    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        if args.load is not None:
            saver.restore(sess, args.load)

        batch_interact(env, view, eval_env, eval_view)
def __init__(self):
    self.controller = Controller()
    self.models = Models(self.controller)
    self.view = View(self.models)