def test_add_with_dimension_mismatch(self):
    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['action_t'] = np.random.random((5, 4))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['reward_t'] = np.random.random((5,))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['value_t'] = np.random.random((5,))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['log_prob_t'] = np.random.random((5,))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['terminal_t'] = np.random.random((5,))
        insert_inputs_to_rollout(inputs, rollout)
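# The Rollout tests in this module share a few fixture helpers whose real
# definitions are not shown here. Below is a minimal sketch matching the
# shapes the assertions imply (4 parallel environments, continuous actions;
# the observation shape and Rollout.add's keyword names are assumptions):
import numpy as np

NUM_ENVS = 4
ACTION_DIM = 4
OBS_SHAPE = (NUM_ENVS, 8)


def make_inputs():
    # one time step of batched experience, one entry per environment
    return {
        'obs_t': np.random.random(OBS_SHAPE),
        'action_t': np.random.random((NUM_ENVS, ACTION_DIM)),
        'reward_t': np.random.random((NUM_ENVS,)),
        'value_t': np.random.random((NUM_ENVS,)),
        'log_prob_t': np.random.random((NUM_ENVS,)),
        'terminal_t': np.random.randint(2, size=NUM_ENVS).astype(np.float32),
    }


def insert_inputs_to_rollout(inputs, rollout):
    # forward one step into the buffer, which validates every shape
    rollout.add(**inputs)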
def test_size(self):
    rollout = Rollout()
    assert rollout.size() == 0

    inputs = make_inputs()
    insert_inputs_to_rollout(inputs, rollout)
    assert rollout.size() == 1

    insert_inputs_to_rollout(inputs, rollout)
    assert rollout.size() == 2
def main(args):
    # environments
    env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale), args.render)
    env.seed(args.seed)
    eval_env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale))
    eval_env.seed(args.seed)
    num_actions = env.action_space.shape[0]

    # network parameters
    params = PPONetworkParams(fcs=args.layers,
                              num_actions=num_actions,
                              state_shape=env.observation_space.shape,
                              num_envs=args.num_envs,
                              batch_size=args.batch_size,
                              epsilon=args.epsilon,
                              learning_rate=args.lr,
                              grad_clip=args.grad_clip,
                              value_factor=args.value_factor,
                              entropy_factor=args.entropy_factor)

    # deep neural network
    network = PPONetwork(params)

    # rollout buffer
    rollout = Rollout()

    # metrics
    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    # controller
    controller = PPOController(network, rollout, metrics, args.num_envs,
                               args.time_horizon, args.epoch, args.batch_size,
                               args.gamma, args.lam, args.final_steps,
                               args.log_interval, args.save_interval,
                               args.eval_interval)

    # view
    view = View(controller)

    # evaluation
    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        if args.load is not None:
            saver.restore(sess, args.load)

        interact(env, view, eval_env, eval_view, batch=True)
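# main() above consumes an argparse-style namespace. The project's actual CLI
# definition is not shown here; the following is a minimal sketch
# reconstructed from the attributes main() reads. Every flag name is grounded
# in an args.<field> reference above, but all default values are illustrative
# assumptions, not the project's real settings.
import argparse


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--env', type=str, default='Pendulum-v0')
    parser.add_argument('--num-envs', type=int, default=4)
    parser.add_argument('--reward-scale', type=float, default=1.0)
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--layers', type=int, nargs='+', default=[64, 64])
    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument('--epsilon', type=float, default=0.2)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--grad-clip', type=float, default=0.5)
    parser.add_argument('--value-factor', type=float, default=0.5)
    parser.add_argument('--entropy-factor', type=float, default=0.01)
    parser.add_argument('--time-horizon', type=int, default=128)
    parser.add_argument('--epoch', type=int, default=4)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--final-steps', type=int, default=10 ** 6)
    parser.add_argument('--log-interval', type=int, default=1000)
    parser.add_argument('--save-interval', type=int, default=10 ** 5)
    parser.add_argument('--eval-interval', type=int, default=10 ** 5)
    parser.add_argument('--eval-episodes', type=int, default=10)
    parser.add_argument('--name', type=str, default='ppo')
    parser.add_argument('--log-adapter', type=str, default=None)
    parser.add_argument('--load', type=str, default=None)
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())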
def test_fetch(self):
    gamma = np.random.random()
    lam = np.random.random()

    rollout = Rollout()
    inputs1 = make_inputs()
    insert_inputs_to_rollout(inputs1, rollout)

    # fetching needs at least two steps: the last one bootstraps the values
    with pytest.raises(AssertionError):
        rollout.fetch(gamma, lam)

    inputs2 = make_inputs()
    insert_inputs_to_rollout(inputs2, rollout)

    trajectory = rollout.fetch(gamma, lam)
    assert np.all(inputs1['obs_t'] == trajectory['obs_t'][0])
    assert np.all(inputs1['action_t'] == trajectory['actions_t'][0])
    assert np.all(inputs1['log_prob_t'] == trajectory['log_probs_t'][0])
    assert trajectory['returns_t'].shape == (1, 4)
    assert trajectory['advantages_t'].shape == (1, 4)
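# test_fetch implies that Rollout.fetch(gamma, lam) turns the stored steps
# into per-env returns and advantages, using the final stored step only to
# bootstrap the last state's value (hence the AssertionError with a single
# step). A minimal NumPy sketch of Generalized Advantage Estimation, assuming
# that is what fetch computes (function name and signature are hypothetical):
import numpy as np


def compute_gae(rewards, values, terminals, bootstrap_value, gamma, lam):
    # rewards, values, terminals: (T, num_envs); bootstrap_value: (num_envs,)
    values = np.concatenate([values, bootstrap_value[np.newaxis]], axis=0)
    advantages = np.zeros_like(rewards)
    last_gae = np.zeros(rewards.shape[1])
    for t in reversed(range(rewards.shape[0])):
        nonterminal = 1.0 - terminals[t]
        # one-step TD error, masked at episode boundaries
        delta = rewards[t] + gamma * values[t + 1] * nonterminal - values[t]
        # exponentially weighted sum of TD errors
        last_gae = delta + gamma * lam * nonterminal * last_gae
        advantages[t] = last_gae
    returns = advantages + values[:-1]
    # with two inserted steps only one is usable, giving shape (1, num_envs),
    # which matches the (1, 4) assertions in test_fetch
    return returns, advantages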
def test_add_success(self):
    rollout = Rollout()

    inputs1 = make_inputs()
    insert_inputs_to_rollout(inputs1, rollout)
    assert_inputs_with_rollout(inputs1, rollout, 0)

    inputs2 = make_inputs()
    insert_inputs_to_rollout(inputs2, rollout)
    assert_inputs_with_rollout(inputs1, rollout, 0)
    assert_inputs_with_rollout(inputs2, rollout, 1)
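# assert_inputs_with_rollout is another helper from the test module; a sketch
# of what it presumably checks, using the Rollout attribute names that appear
# in the controller tests below (the helper body itself is an assumption):
import numpy as np


def assert_inputs_with_rollout(inputs, rollout, index):
    # verify the step stored at `index` matches what was inserted
    assert np.all(inputs['obs_t'] == rollout.obs_t[index])
    assert np.all(inputs['action_t'] == rollout.actions_t[index])
    assert np.all(inputs['reward_t'] == rollout.rewards_t[index])
    assert np.all(inputs['value_t'] == rollout.values_t[index])
    assert np.all(inputs['log_prob_t'] == rollout.log_probs_t[index])
    assert np.all(inputs['terminal_t'] == rollout.terminals_t[index])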
def test_add_with_shape_error(self):
    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['reward_t'] = np.random.random((4, 5))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['value_t'] = np.random.random((4, 5))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['log_prob_t'] = np.random.random((4, 5))
        insert_inputs_to_rollout(inputs, rollout)

    with pytest.raises(AssertionError):
        rollout = Rollout()
        inputs = make_inputs()
        inputs['terminal_t'] = np.random.random((4, 5))
        insert_inputs_to_rollout(inputs, rollout)
def main(args):
    env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale), args.render)
    eval_env = BatchEnvWrapper(
        make_envs(args.env, args.num_envs, args.reward_scale))
    num_actions = env.action_space.shape[0]

    network = PPONetwork(args.layers, env.observation_space.shape,
                         args.num_envs, num_actions, args.batch_size,
                         args.epsilon, args.lr, args.grad_clip,
                         args.value_factor, args.entropy_factor)

    rollout = Rollout()

    saver = tf.train.Saver()
    metrics = Metrics(args.name, args.log_adapter, saver)

    controller = PPOController(network, rollout, metrics, args.num_envs,
                               args.time_horizon, args.epoch, args.batch_size,
                               args.gamma, args.lam, args.final_steps,
                               args.log_interval, args.save_interval,
                               args.eval_interval)

    view = View(controller)

    eval_controller = EvalController(network, metrics, args.eval_episodes)
    eval_view = View(eval_controller)

    # save hyperparameters
    metrics.log_parameters(vars(args))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        # save model graph for debugging
        metrics.set_model_graph(sess.graph)

        if args.load is not None:
            saver.restore(sess, args.load)

        batch_interact(env, view, eval_env, eval_view)
class TestPPOController:
    def setup_method(self):
        self.network = DummyNetwork()
        self.rollout = Rollout()
        self.metrics = DummyMetrics()
        self.controller = PPOController(
            self.network, self.rollout, self.metrics, num_envs=4,
            time_horizon=128, epoch=4, batch_size=32, gamma=0.99, lam=0.9)

    def test_step(self):
        output = make_output(batch_size=4, batch=True)
        self.network._infer = MagicMock(return_value=output)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])

        inpt = make_input(batch_size=4, batch=True)
        action = self.controller.step(*inpt)

        assert np.all(action == output.action)
        assert self.rollout.size() == 1
        assert np.all(inpt[0] == self.rollout.obs_t[0])
        assert np.all(inpt[1] == self.rollout.rewards_t[0])
        assert np.all(inpt[2] == self.rollout.terminals_t[0])
        assert np.all(output.action == self.rollout.actions_t[0])
        assert np.all(output.value == self.rollout.values_t[0])
        assert np.all(output.log_prob == self.rollout.log_probs_t[0])

    def test_should_update(self):
        output = make_output(batch_size=4, batch=True)
        self.network._infer = MagicMock(return_value=output)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])

        inpt = make_input(batch_size=4, batch=True)
        for i in range(128):
            self.controller.step(*inpt)
            assert not self.controller.should_update()
        self.controller.step(*inpt)
        assert self.controller.should_update()

    def test_batches(self):
        output = make_output(batch_size=4, batch=True)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])

        input_history = []
        output_history = []
        for i in range(129):
            inpt = make_input(batch_size=4, batch=True)
            self.network._infer = MagicMock(return_value=output)
            action = self.controller.step(*inpt)
            input_history.append(inpt)
            output_history.append(output)

        for key in ['obs_t', 'actions_t', 'log_probs_t', 'returns_t',
                    'advantages_t', 'values_t']:
            count = 0
            for batch in self.controller._batches():
                count += 1
                assert key in batch
                assert batch[key].shape[0] == 32
                if key == 'obs_t':
                    assert batch[key].shape[1:] == inpt[0].shape[1:]
                elif key == 'actions_t':
                    assert batch[key].shape[1] == action.shape[1]
                elif key == 'log_probs_t':
                    assert len(batch[key].shape) == 1
                elif key == 'returns_t':
                    assert len(batch[key].shape) == 1
                elif key == 'advantages_t':
                    assert len(batch[key].shape) == 1
                elif key == 'values_t':
                    assert len(batch[key].shape) == 1
            assert count == 128 * 4 // 32

    def test_batch_with_short_trajectory_error(self):
        output = make_output(batch_size=4, batch=True)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])
        self.network._infer = MagicMock(return_value=output)

        inpt = make_input(batch_size=4, batch=True)
        action = self.controller.step(*inpt)

        with pytest.raises(AssertionError):
            self.controller._batches()

    def test_update_with_should_update_false(self):
        inpt = make_input(batch_size=4, batch=True)
        output = make_output(batch_size=4, batch=True)
        self.network._infer = MagicMock(return_value=output)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])

        for i in range(20):
            action = self.controller.step(*inpt)

        with pytest.raises(AssertionError):
            self.controller.update()

    def test_update_success(self):
        inpt = make_input(batch_size=4, batch=True)
        output = make_output(batch_size=4, batch=True)
        loss = np.random.random()
        self.network._infer = MagicMock(return_value=output)
        self.network._infer_arguments = MagicMock(return_value=['obs_t'])
        self.network._update_arguments = MagicMock(
            return_value=['obs_t', 'actions_t', 'returns_t',
                          'advantages_t', 'log_probs_t'])
        self.network._update = MagicMock(return_value=loss)

        for i in range(129):
            action = self.controller.step(*inpt)

        assert np.allclose(self.controller.update(), loss)
        assert self.rollout.size() == 0
        assert self.network._update.call_count == 128 * 4 * 4 // 32
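# TestPPOController stubs its collaborators. Minimal sketches of the dummies,
# exposing only the hooks the tests patch with MagicMock (the real signatures
# are an assumption). Note the call-count arithmetic in test_update_success:
# 128 steps x 4 envs = 512 samples, replayed for 4 epochs in batches of 32,
# hence 128 * 4 * 4 // 32 = 64 calls to _update.
class DummyNetwork:
    def _infer(self, **kwargs):
        raise NotImplementedError

    def _infer_arguments(self):
        raise NotImplementedError

    def _update(self, **kwargs):
        raise NotImplementedError

    def _update_arguments(self):
        raise NotImplementedError


class DummyMetrics:
    def log_metric(self, name, value, step):
        # no-op stand-in for the real Metrics logger
        pass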