def test_hindsight(self):
  """Test Hindsight Experience Replay."""
  # The environment is a plane in which the agent moves by steps until it
  # reaches a randomly positioned goal.  No reward is given until it reaches
  # the goal.  That makes it very hard to learn by standard methods, since it
  # may take a very long time to receive any feedback at all.  Using hindsight
  # makes it much easier.

  class TestEnvironment(dc.rl.Environment):

    def __init__(self):
      super(TestEnvironment, self).__init__((4,), 4)
      self.moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    def reset(self):
      self._state = np.concatenate([[0, 0], np.random.randint(-50, 50, 2)])
      self._terminated = False
      self.count = 0

    def step(self, action):
      new_state = self._state.copy()
      new_state[:2] += self.moves[action]
      self._state = new_state
      self.count += 1
      reward = 0
      if np.array_equal(new_state[:2], new_state[2:]):
        self._terminated = True
        reward = 1
      elif self.count == 1000:
        self._terminated = True
      return reward

    def apply_hindsight(self, states, actions, goal):
      new_states = []
      rewards = []
      goal_pos = goal[:2]
      for state, action in zip(states, actions):
        new_state = state.copy()
        new_state[2:] = goal_pos
        new_states.append(new_state)
        pos_after_action = new_state[:2] + self.moves[action]
        if np.array_equal(pos_after_action, goal_pos):
          rewards.append(1)
        else:
          rewards.append(0)
      return new_states, rewards

  # A simple policy with two hidden layers.

  class TestPolicy(dc.rl.Policy):

    def create_layers(self, state, **kwargs):
      dense1 = Dense(6, activation_fn=tf.nn.relu, in_layers=state)
      dense2 = Dense(6, activation_fn=tf.nn.relu, in_layers=dense1)
      output = Dense(
          4,
          activation_fn=tf.nn.softmax,
          biases_initializer=None,
          in_layers=dense2)
      value = Dense(1, in_layers=dense2)
      return {'action_prob': output, 'value': value}

  # Optimize it.

  env = TestEnvironment()
  learning_rate = PolynomialDecay(
      initial_rate=0.0001, final_rate=0.00005, decay_steps=1500000)
  ppo = dc.rl.PPO(
      env,
      TestPolicy(),
      use_hindsight=True,
      optimization_epochs=8,
      optimizer=Adam(learning_rate=learning_rate))
  ppo.fit(1500000)

  # Try running it a few times and see if it succeeds.

  pass_count = 0
  for i in range(5):
    env.reset()
    while not env.terminated:
      env.step(ppo.select_action(env.state))
    if np.array_equal(env.state[:2], env.state[2:]):
      pass_count += 1
  assert pass_count >= 3
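
# The following is an illustrative sketch, not part of the original test suite:
# it exercises the goal-relabeling idea behind apply_hindsight above on a tiny
# hand-built trajectory, with no training involved. The method name
# test_hindsight_relabeling_sketch and the hard-coded trajectory are assumptions
# made for illustration; the state layout [agent_x, agent_y, goal_x, goal_y]
# matches TestEnvironment in test_hindsight.
def test_hindsight_relabeling_sketch(self):
  """Sketch: relabel a failed trajectory so its endpoint becomes the goal."""
  moves = [(-1, 0), (1, 0), (0, -1), (0, 1)]
  # A three-step trajectory that never reaches the original goal at (40, 40).
  states = [
      np.array([0, 0, 40, 40]),
      np.array([1, 0, 40, 40]),
      np.array([1, 1, 40, 40])
  ]
  actions = [1, 3, 3]  # right, up, up; the final position reached is (1, 2)
  goal = np.array([1, 2, 40, 40])
  # Relabel: substitute the position actually reached for the original goal,
  # then recompute rewards the same way apply_hindsight does.
  rewards = []
  for state, action in zip(states, actions):
    new_state = state.copy()
    new_state[2:] = goal[:2]
    reached = np.array_equal(new_state[:2] + moves[action], goal[:2])
    rewards.append(1 if reached else 0)
  # Under the substituted goal, only the final step earns a reward, so the
  # otherwise reward-free episode now provides a learning signal.
  assert rewards == [0, 0, 1]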
def test_continuous(self):
  """Test A3C on an environment with a continuous action space."""
  # The state consists of two numbers: a current value and a target value.
  # The policy just needs to learn to output the target value (or at least
  # move toward it).

  class TestEnvironment(dc.rl.Environment):

    def __init__(self):
      super(TestEnvironment, self).__init__((2,), action_shape=(1,))

    def reset(self):
      target = np.random.uniform(-50, 50)
      self._state = np.array([0, target])
      self._terminated = False
      self.count = 0

    def step(self, action):
      target = self._state[1]
      dist = np.abs(target - action[0])
      old_dist = np.abs(target - self._state[0])
      new_state = np.array([action[0], target])
      self._state = new_state
      self.count += 1
      reward = old_dist - dist
      self._terminated = (self.count == 10)
      return reward

  # A simple policy with no hidden layers.

  class TestPolicy(dc.rl.Policy):

    def create_layers(self, state, **kwargs):
      action_mean = Dense(
          1, in_layers=state, weights_initializer=tf.zeros_initializer)
      action_std = Constant([10.0])
      value = Dense(1, in_layers=state)
      return {
          'action_mean': action_mean,
          'action_std': action_std,
          'value': value
      }

  # Optimize it.

  env = TestEnvironment()
  learning_rate = PolynomialDecay(
      initial_rate=0.005, final_rate=0.0005, decay_steps=25000)
  a3c = dc.rl.A3C(
      env,
      TestPolicy(),
      discount_factor=0,
      optimizer=Adam(learning_rate=learning_rate))
  a3c.fit(25000)

  # Try running it and see if it reaches the target.
  env.reset()
  while not env.terminated:
    env.step(a3c.select_action(env.state, deterministic=True))
  distance = np.abs(env.state[0] - env.state[1])
  tolerance = max(1.0, 0.1 * np.abs(env.state[1]))
  assert distance < tolerance
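
# The following is an illustrative sketch, not part of the original test suite:
# it spells out why the shaped reward in test_continuous is easy to learn from
# even with discount_factor=0. Because each step's reward is old_dist - dist,
# the per-step rewards telescope, so an episode's total reward equals the
# initial distance to the target minus the final distance. The method name
# test_continuous_reward_telescopes and the hard-coded actions are assumptions
# made for illustration.
def test_continuous_reward_telescopes(self):
  """Sketch: the episode return equals the reduction in distance to the target."""
  target = 7.0
  state = 0.0
  actions = [2.0, 5.0, 6.5]  # arbitrary actions moving toward the target
  total_reward = 0.0
  for action in actions:
    # Same shaping as TestEnvironment.step: reward the decrease in distance.
    old_dist = abs(target - state)
    new_dist = abs(target - action)
    total_reward += old_dist - new_dist
    state = action
  initial_dist = abs(target - 0.0)
  final_dist = abs(target - state)
  assert np.isclose(total_reward, initial_dist - final_dist)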