def testCombineTransitions(self):
  states, actions, rewards, new_states, reward_mask = (
      base.Brain.CombineTransitions([
          base.Transition(
              s=numpy.array([[1, 2, 3]]),
              a=numpy.array([[0, 1, 0]]),
              r=1.0,
              sp=numpy.array([[4, 5, 6]]),
          ),
          base.Transition(
              s=numpy.array([[4, 5, 6]]),
              a=numpy.array([[0, 0, 1]]),
              r=-1.0,
              sp=None,
          ),
      ]))
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[1, 2, 3], [4, 5, 6]]), states)
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0, 1, 0], [0, 0, 1]]), actions)
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([1.0, -1.0]), rewards)
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[4, 5, 6], [4, 5, 6]]), new_states)
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([1, 0]), reward_mask)

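# For reference, a minimal sketch of the batching behavior the test above pins
# down. base.Brain.CombineTransitions is the real API; this body is inferred
# from the assertions only (the actual implementation may differ), and the
# name combine_transitions_sketch is illustrative.
def combine_transitions_sketch(transitions):
  states = numpy.concatenate([t.s for t in transitions])
  actions = numpy.concatenate([t.a for t in transitions])
  rewards = numpy.array([t.r for t in transitions])
  # A terminal transition (sp is None) reuses s as a placeholder next state,
  # and its reward_mask entry is 0 so the future value is ignored.
  new_states = numpy.concatenate(
      [t.sp if t.sp is not None else t.s for t in transitions])
  reward_mask = numpy.array(
      [1 if t.sp is not None else 0 for t in transitions])
  return states, actions, rewards, new_states, reward_mask
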
def test_convergence(self):
  a3c = a3c_impl.A3C(
      model=a3c_impl.CreateModel(
          state_shape=(3,),
          action_space_size=2,
          hidden_layer_sizes=(3,),
      ),
      # optimizer=a3c_impl.CreateDefaultOptimizer(learning_rate=0.05),
  )
  s = numpy.array([[1, 2, 3]])
  a1 = numpy.array([[1, 0]])
  a2 = numpy.array([[0, 1]])
  for _ in range(10):
    # Needs to train on both actions in each step; otherwise the values show
    # a "staggering" effect.
    a3c.UpdateFromTransitions([
        base.Transition(s=s, a=a1, r=1.0, sp=None),
    ])
    a3c.UpdateFromTransitions([
        base.Transition(s=s, a=a2, r=-1.0, sp=s),
    ])
    logging.printf('%s', a3c.GetValues(s))

  old_value_a1 = a3c.GetActionValues(a3c.GetValues(s), a1)
  # Trains for one more step, for both actions.
  a3c.UpdateFromTransitions([
      base.Transition(s=s, a=a1, r=1.0, sp=None),
  ])
  a3c.UpdateFromTransitions([
      base.Transition(s=s, a=a2, r=-1.0, sp=s),
  ])
  self.assertGreaterEqual(
      a3c.GetActionValues(a3c.GetValues(s), a1), old_value_a1)

def test_runUsesNewStateAfterIteration(self):
  self.env.TakeAction.side_effect = [
      base.Transition(
          s=numpy.array([[0]]),
          a=numpy.array([[0]]),
          r=1.0,
          sp=numpy.array([[1]]),
      ),
      base.Transition(
          s=numpy.array([[1]]),
          a=numpy.array([[0]]),
          r=1.0,
          sp=None,
      ),
  ]
  self.runner.Run(
      env=self.env,
      brain=self.qfunc,
      policy=self.policy,
      num_of_episodes=1,
  )
  # Tests that the second call is made with the new state 1.
  self.policy.Decide.assert_called_with(
      env=mock.ANY,
      brain=mock.ANY,
      state=numpy.array([[1]]),
      episode_idx=0,
      num_of_episodes=1)

def test_UpdateValues_singleTransition(self):
  self.qfunc._protected_SetValues(
      numpy.array([
          [1, 2, 3],
          [4, 5, 6],
          [2, 2, 2],
      ]),
      numpy.array([
          [0.5, 0.5],
          [0.3, 0.7],
          [0.8, 0.9],
      ]))
  self.qfunc.UpdateFromTransitions([base.Transition(
      s=numpy.array([[1, 2, 3]]),
      a=numpy.array([[0, 1]]),
      r=1.0,
      sp=numpy.array([[2, 2, 2]]),
  )])
  # The new values for state (1, 2, 3) should be:
  # - action (1, 0): 0.5, since it's not changed.
  # - action (0, 1): max(0.8, 0.9) * 0.5 + 1.0 = 1.45.
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0.5, 1.45]]),
      self.qfunc.GetValues(numpy.array([[1, 2, 3]])))

def test_learningRate(self):
  # Disables learning from Q* to simplify testing.
  qfunc = qfunc_impl.MemoizationQFunction(
      action_space_size=2,
      discount_factor=0.0,
      learning_rate=0.9,
  )
  qfunc._protected_SetValues(
      numpy.array([
          [1, 2, 3],
          [4, 5, 6],
      ]),
      numpy.array([
          [0.5, 0.6],
          [0.3, 0.7],
      ]))
  qfunc.UpdateFromTransitions([base.Transition(
      s=numpy.array([[1, 2, 3]]),
      a=numpy.array([[0, 1]]),
      r=1.0,
      sp=numpy.array([[2, 2, 2]]),
  )])
  # The new values for state (1, 2, 3) should be:
  # - action (1, 0): 0.5, since it's not changed.
  # - action (0, 1): (1 - 0.9) * 0.6 + 0.9 * 1.0 = 0.96.
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0.5, 0.96]]),
      qfunc.GetValues(numpy.array([[1, 2, 3]])))

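# A standalone sketch of the update rule that test_UpdateValues_singleTransition
# and test_learningRate verify. This is an illustrative reimplementation, not
# the actual MemoizationQFunction code; it assumes the standard Q-learning
# target and that the first test's qfunc uses learning_rate=1.0:
#   new_q = (1 - learning_rate) * old_q
#           + learning_rate * (r + discount_factor * max_a' Q(sp, a'))
# For a terminal transition (sp is None), the target reduces to r alone.
def q_update_sketch(old_q, reward, max_next_q, learning_rate, discount_factor):
  target = reward + discount_factor * max_next_q
  return (1.0 - learning_rate) * old_q + learning_rate * target

# Reproduces the expected values from the two tests above.
assert abs(q_update_sketch(0.5, 1.0, 0.9, 1.0, 0.5) - 1.45) < 1e-9
assert abs(q_update_sketch(0.6, 1.0, 0.9, 0.9, 0.0) - 0.96) < 1e-9
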
def TakeAction(self, action: base.Action) -> base.Transition:
  current_state = self._current_state
  move = self.GetChoiceFromAction(action) - 1  # -1, 0, or 1
  new_state = current_state + move

  # Reward: 0 for staying put, +1 for moving toward 0, -1 for moving away.
  if move == 0:
    r = 0
  elif (move == 1 and current_state < 0) or (move == -1 and current_state > 0):
    r = 1
  else:
    r = -1

  # The state wraps around at both ends of [-size, size].
  if new_state > self._size:
    new_state = -self._size
  elif new_state < -self._size:
    new_state = self._size

  s = numpy.array([[current_state]])
  a = action
  if self._num_actions_taken >= STEP_LIMIT:
    sp = None  # The episode ends after STEP_LIMIT actions.
  else:
    sp = numpy.array([[new_state]])

  self._current_state = new_state
  self._num_actions_taken += 1
  return base.Transition(s, a, r, sp)

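# A distilled restatement of the reward and wrap-around rules in TakeAction
# above, handy for sanity checks; step_sketch and its arguments are
# illustrative names, not part of the environment's API.
def step_sketch(state, move, size):
  if move == 0:
    reward = 0
  elif (move == 1 and state < 0) or (move == -1 and state > 0):
    reward = 1
  else:
    reward = -1
  new_state = state + move
  if new_state > size:
    new_state = -size
  elif new_state < -size:
    new_state = size
  return new_state, reward

assert step_sketch(-3, 1, 5) == (-2, 1)   # Moving toward 0 earns +1.
assert step_sketch(5, 1, 5) == (-5, -1)   # Stepping past +size wraps to -size.
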
def test_saveLoad(self):
  a3c = a3c_impl.A3C(
      model=a3c_impl.CreateModel(
          state_shape=(3,),
          action_space_size=2,
          hidden_layer_sizes=(3,),
      ))
  tmp_file = tempfile.NamedTemporaryFile().name
  s = numpy.array([[1, 2, 3]])
  for _ in range(10):
    a3c.UpdateFromTransitions([
        base.Transition(
            s=s,
            a=numpy.array([[1, 0]]),
            r=1.0,
            sp=numpy.array([[4, 5, 6]])),
    ])
  a3c.Save(tmp_file)
  saved_values = a3c.GetValues(s)

  a3c = a3c_impl.A3C(
      model=a3c_impl.CreateModel(
          state_shape=(3,),
          action_space_size=2,
          hidden_layer_sizes=(3,),
      ))
  a3c.Load(tmp_file)
  numpy_util.TestUtil.AssertArrayEqual(saved_values, a3c.GetValues(s))

def train_push(self, s, a, r, s_):
  return self._brain.UpdateFromTransitions([
      base.Transition(
          s=numpy.array([s]),
          a=numpy.array([a]),
          r=r,
          sp=numpy.array([s_]))
  ])

def setUp(self) -> None:
  self.brain = mock.MagicMock()
  self.runner = runner_impl.NStepExperienceRunner(
      discount_factor=0.5,
      n_step_return=5,
  )
  self.tran = base.Transition(
      s=numpy.array([[0]]),
      a=numpy.array([[1]]),
      r=1.0,
      sp=numpy.array([[0]]),
  )

def test_memoryManagement(self):
  qfunc = qfunc_impl.RandomQFunction(action_space_size=2)
  runner = runner_impl.ExperienceReplayRunner(
      experience_capacity=1,
      experience_sample_batch_size=1,
      train_every_n_steps=1)
  tran1 = base.Transition(
      s=numpy.array([[1, 2]]),
      a=numpy.array([[1, 0]]),
      r=1,
      sp=numpy.array([[3, 4]]))
  tran2 = base.Transition(
      s=numpy.array([[3, 4]]),
      a=numpy.array([[0, 1]]),
      r=1,
      sp=numpy.array([[5, 6]]))
  runner._protected_ProcessTransition(qfunc, tran1, 0)
  runner._protected_ProcessTransition(qfunc, tran2, 1)

  # With capacity 1, only the most recent transition is retained.
  hist = runner._experience._history
  self.assertEqual(1, len(hist))
  self.assertEqual(tran2, hist[0])

def testCalculateNStepReward_whenDone(self):
  for _ in range(4):
    self.runner._protected_ProcessTransition(self.brain, self.tran, 0)
  self.assertFalse(self.brain.called)

  tran = base.Transition(
      s=numpy.array([[0]]),
      a=numpy.array([[1]]),
      r=1.0,
      sp=None,
  )
  self.runner._protected_ProcessTransition(self.brain, tran, 0)

  rewards = []
  for tran in self.brain.UpdateFromTransitions.call_args[0][0]:
    rewards.append(tran.r)
  # When the episode ends, every buffered transition is flushed with its
  # discounted n-step return (discount_factor = 0.5).
  self.assertCountEqual([
      1.0,
      1.0 + 0.5,
      1.0 + 0.5 + 0.5**2,
      1.0 + 0.5 + 0.5**2 + 0.5**3,
      1.0 + 0.5 + 0.5**2 + 0.5**3 + 0.5**4,
  ], rewards)

def _GetNStepTransition(self) -> base.Transition:
  # This implementation takes 3.542e-06 sec per call.
  R = 0.0
  next_discount_factor = 1.0
  for tran in self._memory:
    R += tran.r * next_discount_factor
    next_discount_factor *= self._gamma

  # The commented-out implementation below takes 7.322e-06 sec per call:
  #   rewards = numpy.zeros(self._n_step_return)
  #   for idx, tran in enumerate(self._memory):
  #     rewards[idx] = tran.r
  #   R = numpy.sum(self._gamma_powers * rewards)

  return base.Transition(
      s=self._memory[0].s,
      a=self._memory[0].a,
      r=R,
      sp=self._memory[-1].sp,
  )

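# A quick, illustrative check of the discounted n-step return computed above:
# with gamma = 0.5 and five transitions each carrying r = 1.0, the combined
# reward is 1 + 0.5 + 0.5**2 + 0.5**3 + 0.5**4 = 1.9375, matching the
# expectations in testCalculateNStepReward_whenDone. n_step_return_sketch is
# an illustrative name, not part of the runner's API.
def n_step_return_sketch(rewards, gamma):
  R, discount = 0.0, 1.0
  for r in rewards:
    R += r * discount
    discount *= gamma
  return R

assert n_step_return_sketch([1.0] * 5, 0.5) == 1 + 0.5 + 0.5**2 + 0.5**3 + 0.5**4
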
def test_UpdateValues_environmentDone(self):
  self.qfunc._protected_SetValues(
      numpy.array([
          [1, 2, 3],
          [4, 5, 6],
      ]),
      numpy.array([
          [0.5, 0.5],
          [0.3, 0.7],
      ]))
  self.qfunc.UpdateFromTransitions([base.Transition(
      s=numpy.array([[1, 2, 3]]),
      a=numpy.array([[0, 1]]),
      r=1.0,
      sp=None,
  )])
  # The new values for state (1, 2, 3) should be:
  # - action (1, 0): 0.5, since it's not changed.
  # - action (0, 1): 1.0; the environment is done, so only the reward is used.
  numpy_util.TestUtil.AssertArrayEqual(
      numpy.array([[0.5, 1.0]]),
      self.qfunc.GetValues(numpy.array([[1, 2, 3]])))