def test_convergence_sgd_one_freeze(self):
    """SGD with one target-network freeze converges to the expected Q-table."""
    freeze_interval = 500
    learner = q_network.DeepQLearner(
        self.mdp.num_states, 1, self.mdp.num_actions, 1,
        self.discount, self.learning_rate, 0, 0, 0,
        freeze_interval, 1, 'linear', 'sgd', 'sum', 1.0)
    # Train long enough to span two freeze intervals.
    self.train(learner, freeze_interval * 2)
    numpy.testing.assert_almost_equal(
        self.all_q_vals(learner),
        [[.7, 0], [.35, .5], [0, 1.0], [0., 0.]], 3)
def test_updates_sgd_no_freeze(self):
    """Single SGD updates (no target freeze) change exactly the expected Q-entries."""
    freeze_interval = -1
    net = q_network.DeepQLearner(
        self.mdp.num_states, 1, self.mdp.num_actions, 1,
        self.discount, self.learning_rate, 0, 0, 0,
        freeze_interval, 1, 'linear', 'sgd', 'sum', 1.0)
    mdp = self.mdp
    # (state, action index, full q-table expected after that single update)
    steps = [
        # Depart left:
        (mdp.states[0], 0, [[.07, 0], [0, 0], [0, 0], [0, 0]]),
        # Depart right:
        (mdp.states[-2], 1, [[.07, 0], [0, 0], [0, .1], [0, 0]]),
        # Move into leftmost state
        (mdp.states[1], 0, [[.07, 0], [0.0035, 0], [0, .1], [0, 0]]),
    ]
    for state, action_index, expected in steps:
        reward, next_state, terminal = mdp.act(state, action_index)
        net.train(state, mdp.actions[action_index], reward,
                  next_state, terminal)
        numpy.testing.assert_almost_equal(self.all_q_vals(net), expected)
def test_convergence_random_initialization(self):
    """Convergence from randomized initial weights.

    This test will only pass if terminal states are handled correctly.
    Otherwise the random initialization of the value of the terminal
    state will propagate back.
    """
    freeze_interval = -1
    learner = q_network.DeepQLearner(
        self.mdp.num_states, 1, self.mdp.num_actions, 1,
        self.discount, self.learning_rate, 0, 0, 0,
        freeze_interval, 1, 'linear', 'sgd', 'sum', 1.0)
    # Randomize initial q-values:
    current = lasagne.layers.helper.get_all_param_values(learner.l_out)
    randomized = numpy.array(np.random.random(current[0].shape),
                             dtype=theano.config.floatX)
    lasagne.layers.helper.set_all_param_values(learner.l_out, [randomized])
    self.train(learner, 1000)
    # Only the non-terminal rows are checked.
    numpy.testing.assert_almost_equal(
        self.all_q_vals(learner)[0:3, :],
        [[.7, .25], [.35, .5], [.25, 1.0]], 3)
def make_net(self, freeze_interval):
    """Build a DeepQLearner for this MDP with the given freeze interval.

    NOTE(review): this call passes one more leading 0 and an explicit
    RandomState compared to the inline constructor calls elsewhere in the
    file — presumably a different DeepQLearner signature; confirm.
    """
    return q_network.DeepQLearner(
        self.mdp.num_states, 1, self.mdp.num_actions, 1,
        self.discount, self.learning_rate, 0, 0, 0, 0,
        freeze_interval, 1, 'linear', 'sgd', 'sum',
        np.random.RandomState(), 1.0)