def test_reset_3(self):
    import rewardpredictive as rp
    import rlutils as rl
    import numpy as np

    agent = rp.utils.SFLearning(
        num_states=14,
        num_actions=3,
        learning_rate_sf=0.1,
        learning_rate_reward=0.2,
        gamma=0.9,
        init_sf_mat=np.eye(3 * 14, dtype=np.float32),
        init_w_vec=np.zeros(3 * 14, dtype=np.float32)
    )
    self.assertEqual(agent.get_error_avg(), 0.)
    for _ in range(10):
        agent.update_transition(rl.one_hot(0, 14), 0, 0, rl.one_hot(0, 14), False, {})
    agent.reset(reset_sf=True, reset_w=True)
    self.assertTrue(
        np.all(agent.get_sf_matrix() == np.eye(3 * 14, dtype=np.float32)))
    self.assertTrue(
        np.all(agent.get_w_vector() == np.zeros(3 * 14, dtype=np.float32)))
def test_get_error_avg(self):
    import rewardpredictive as rp
    import rlutils as rl
    import numpy as np

    agent = rp.utils.SFLearning(
        num_states=14,
        num_actions=3,
        learning_rate_sf=0.1,
        learning_rate_reward=0.2,
        gamma=0.9,
        init_sf_mat=np.eye(3 * 14, dtype=np.float32),
        init_w_vec=np.zeros(3 * 14, dtype=np.float32)
    )
    self.assertEqual(agent.get_error_avg(), 0.)
    s = rl.one_hot(0, 14)
    err_dict_list = [
        agent.update_transition(s, 0, 0, s, False, {}) for _ in range(10)
    ]
    err_avg = np.mean([
        np.linalg.norm(d['sf_error']) + abs(d['r_error']) for d in err_dict_list
    ])
    self.assertEqual(agent.get_error_avg(), err_avg)
def test(self):
    import rewardpredictive as rp
    import numpy as np
    import rlutils as rl
    from rlutils.environment.gridworld import pt_to_idx

    mdp = rp.mdp.GridWord3x3WithGoalsAndWalls([0, 1], 0, slip_prob=0.)
    t_mat, r_mat = mdp.get_t_mat_r_mat()
    left = rl.environment.gridworld.GridWorldAction.LEFT
    right = rl.environment.gridworld.GridWorldAction.RIGHT
    up = rl.environment.gridworld.GridWorldAction.UP
    down = rl.environment.gridworld.GridWorldAction.DOWN
    pt_to_state = lambda x, y: pt_to_idx((x, y), (3, 3))
    next_state = lambda x, y, a: np.where(
        np.matmul(rl.one_hot(pt_to_state(x, y), 9), t_mat[a]))[0][0]

    self.assertEqual(next_state(0, 0, left), pt_to_state(0, 0))
    self.assertEqual(next_state(0, 1, left), pt_to_state(0, 1))
    self.assertEqual(next_state(0, 2, left), pt_to_state(0, 2))
    self.assertEqual(next_state(1, 0, left), pt_to_state(1, 0))
    self.assertEqual(next_state(1, 1, left), pt_to_state(1, 1))
    self.assertEqual(next_state(1, 2, left), pt_to_state(0, 2))
    self.assertEqual(next_state(2, 0, left), pt_to_state(1, 0))
    self.assertEqual(next_state(2, 1, left), pt_to_state(1, 1))
    self.assertEqual(next_state(2, 2, left), pt_to_state(1, 2))

    self.assertEqual(next_state(0, 0, right), pt_to_state(0, 0))
    self.assertEqual(next_state(0, 1, right), pt_to_state(0, 1))
    self.assertEqual(next_state(0, 2, right), pt_to_state(1, 2))
    self.assertEqual(next_state(1, 0, right), pt_to_state(2, 0))
    self.assertEqual(next_state(1, 1, right), pt_to_state(2, 1))
    self.assertEqual(next_state(1, 2, right), pt_to_state(2, 2))
    self.assertEqual(next_state(2, 0, right), pt_to_state(2, 0))
    self.assertEqual(next_state(2, 1, right), pt_to_state(2, 1))
    self.assertEqual(next_state(2, 2, right), pt_to_state(2, 2))
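# Illustrative layout inferred from the assertions above (not taken from the
# repository): a vertical wall separates column 0 from column 1 at y = 0 and
# y = 1, with an opening at y = 2. Coordinates are (x, y), x being the column.
#
#   y=2   (0,2)   (1,2) (2,2)
#   y=1   (0,1) | (1,1) (2,1)
#   y=0   (0,0) | (1,0) (2,0)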
def test_score_partial_model_small_3(self):
    import rewardpredictive as rp
    import rlutils as rl

    table_model = rp.utils.TableModel(num_states=4, num_actions=2)
    table_model.update_transition(rl.one_hot(0, 4), 0, 0, rl.one_hot(2, 4), False, {})
    table_model.update_transition(rl.one_hot(0, 4), 1, 0, rl.one_hot(0, 4), False, {})
    table_model.update_transition(rl.one_hot(2, 4), 0, 1, rl.one_hot(2, 4), False, {})
    table_model.update_transition(rl.one_hot(2, 4), 1, 1, rl.one_hot(2, 4), False, {})
    table_model.update_transition(rl.one_hot(1, 4), 1, 0, rl.one_hot(1, 4), False, {})
    mdp, phi_mat = self._get_four_state_task()
    t_mat, r_vec = table_model.get_t_mat_r_vec()
    score = rp.reward_maximizing.reward_maximizing_score(
        phi_mat, t_mat, r_vec, table_model.visitation_counts(), gamma=0.9
    )
    self.assertLessEqual(-1e-5, score)
def test_convergence(self):
    import rewardpredictive as rp
    import rlutils as rl
    import numpy as np
    from itertools import product

    mdp = rp.mdp.ColumnWorld2(slip_prob=0.)
    t_mat, r_vec = mdp.get_t_mat_r_vec()
    q_star, _ = rl.algorithm.vi(t_mat, r_vec, gamma=0.9)
    pi_star = np.argmax(q_star, axis=0)

    num_act = mdp.num_actions()
    num_states = mdp.num_states()
    p_mat = np.zeros([num_act * num_states, num_act * num_states], dtype=np.float32)
    for a, s in product(range(num_act), range(num_states)):
        sn = np.where(t_mat[a, s] == 1)[0][0]
        an = pi_star[sn]
        p_mat[num_states * a + s] = rl.one_hot(num_states * an + sn, num_states * num_act)
    psi_mat = np.linalg.pinv(np.eye(num_act * num_states) - 0.9 * p_mat)
    w_vec = np.reshape(r_vec, -1)
    q_flat = np.matmul(psi_mat, w_vec)
    self.assertLessEqual(
        np.linalg.norm(q_flat - np.reshape(q_star, -1), ord=np.inf), 1e-4)

    agent = rp.utils.SFLearning(
        num_states=num_states,
        num_actions=num_act,
        learning_rate_sf=0.1,
        learning_rate_reward=0.2,
        gamma=0.9,
        init_sf_mat=psi_mat,
        init_w_vec=w_vec
    )
    self.assertLessEqual(np.max(np.abs(agent.get_q_vector() - q_star)), 1e-4)
    s = rl.one_hot(0, num_states)
    sn = np.matmul(s, t_mat[0])
    for _ in range(10):
        agent.update_transition(s, 0, r_vec[0, 0], sn, False, {})
    agent.on_simulation_timeout()
    self.assertLessEqual(np.max(np.abs(agent.get_q_vector() - q_star)), 1e-4)
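# A minimal standalone sketch (not from the repository) of the identity
# test_convergence exercises: for a deterministic policy with state-action
# transition matrix P_pi, the successor-feature matrix is
# Psi = (I - gamma * P_pi)^(-1), and the flattened Q-function is Psi @ w,
# where w holds the one-step rewards. The toy chain below is illustrative.
def _sf_identity_sketch():
    import numpy as np
    gamma = 0.9
    # Two-state chain: state 0 -> state 1 -> state 1, rewards [0, 1].
    p_mat = np.array([[0., 1.], [0., 1.]])
    w_vec = np.array([0., 1.])
    psi_mat = np.linalg.inv(np.eye(2) - gamma * p_mat)
    q_vec = psi_mat @ w_vec
    # Cross-check against the Bellman fixed point q = w + gamma * P @ q.
    assert np.allclose(q_vec, w_vec + gamma * p_mat @ q_vec)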
def test_update_transition(self):
    import rewardpredictive as rp
    import rlutils as rl
    import numpy as np

    table_model = rp.utils.TableModel(3, 2, max_reward=1)
    t_mat, r_vec = table_model.get_t_mat_r_vec()
    self.assertTrue(np.all(t_mat[0] == np.eye(3, dtype=np.float32)))
    self.assertTrue(np.all(t_mat[1] == np.eye(3, dtype=np.float32)))
    self.assertTrue(np.all(r_vec[0] == np.ones(3, dtype=np.float32)))
    self.assertTrue(np.all(r_vec[1] == np.ones(3, dtype=np.float32)))
    self.assertTrue(np.all(table_model.visitation_counts() == 0))

    table_model.update_transition(rl.one_hot(0, 3), 0, .1, rl.one_hot(0, 3), False, {})
    table_model.update_transition(rl.one_hot(0, 3), 0, 0, rl.one_hot(1, 3), False, {})
    table_model.on_simulation_timeout()
    table_model.update_transition(rl.one_hot(1, 3), 1, .5, rl.one_hot(2, 3), True, {})

    t_mat, r_vec = table_model.get_t_mat_r_vec()
    t_mat_corr = np.array(
        [[[.5, .5, 0.], [0., 1., 0.], [0., 0., 1.]],
         [[1., 0., 0.], [0., 0., 1.], [0., 0., 1.]]], dtype=np.float32)
    self.assertTrue(np.all(t_mat == t_mat_corr))
    r_vec_corr = np.array([[.05, 1., 1.], [1., 0.5, 1.]], dtype=np.float32)
    self.assertTrue(np.all(r_vec == r_vec_corr))
    visitation_counts_corr = np.array([[2, 0, 0], [0, 1, 0]], dtype=int)
    self.assertTrue(
        np.all(table_model.visitation_counts() == visitation_counts_corr))
def get_t_mat_r_vec(self):
    num_actions, num_states = np.shape(self._v_cnt)
    t_mat = np.zeros(np.shape(self._t_cnt), dtype=np.float32)
    r_vec = np.zeros(np.shape(self._r_sum), dtype=np.float32)
    for a, i in product(range(num_actions), range(num_states)):
        if self._v_cnt[a, i] == 0:
            # Unvisited (action, state) pair: default to a self-loop and
            # the default reward.
            t_mat[a, i] = rl.one_hot(i, num_states)
            r_vec[a, i] = self._r_default[a, i]
        else:
            # Empirical transition frequencies and average observed reward.
            t_mat[a, i] = self._t_cnt[a, i] / self._v_cnt[a, i]
            r_vec[a, i] = self._r_sum[a, i] / self._v_cnt[a, i]
    return t_mat, r_vec
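# Hedged usage sketch for get_t_mat_r_vec above, assuming only the TableModel
# API already used in the tests in this file: visited (action, state) pairs
# yield empirical transition frequencies and average rewards, while unvisited
# pairs fall back to a self-loop.
def _table_model_defaults_sketch():
    import numpy as np
    import rlutils as rl
    import rewardpredictive as rp
    model = rp.utils.TableModel(num_states=2, num_actions=1)
    # Visit (a=0, s=0) twice: once to state 0, once to state 1.
    model.update_transition(rl.one_hot(0, 2), 0, 1., rl.one_hot(0, 2), False, {})
    model.update_transition(rl.one_hot(0, 2), 0, 0., rl.one_hot(1, 2), False, {})
    t_mat, r_vec = model.get_t_mat_r_vec()
    assert np.allclose(t_mat[0, 0], [.5, .5])  # empirical frequencies
    assert np.isclose(r_vec[0, 0], .5)         # average observed reward
    assert np.allclose(t_mat[0, 1], [0., 1.])  # unvisited: self-loop default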
def _get_best_total_reward_at_transfer(total_reward, scores, frac=0.05):
    num_abs, num_task = np.shape(total_reward)
    best_num = int(np.ceil(num_abs * frac))
    total_reward_transfer = []
    for task_idx in range(num_task):
        # Select the abstractions with the lowest scores on this task.
        best_part = np.argsort(scores[:, task_idx])[:best_num]
        total_reward_all = np.array([total_reward[i] for i in best_part])
        # Average each selected abstraction's total reward over all other
        # tasks; the task the score was computed on is masked out.
        bit_mask = np.reshape(1. - rl.one_hot(task_idx, num_task), [1, -1])
        bit_mask = bit_mask / np.sum(bit_mask, axis=-1, keepdims=True)
        total_reward_transfer_from_task = np.sum(total_reward_all * bit_mask, axis=-1)
        total_reward_transfer.append(total_reward_transfer_from_task)
    total_reward_transfer = np.stack(total_reward_transfer)
    # For each selected abstraction, sample a random source task.
    rand_task_idx = np.random.randint(0, num_task, size=best_num)
    total_reward_transfer = np.stack(
        [total_reward_transfer[j, i] for i, j in enumerate(rand_task_idx)])
    return total_reward_transfer
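# Hedged sketch of the selection logic in _get_best_total_reward_at_transfer
# on toy data: for each task, the lowest-scoring abstractions are selected,
# and their total reward is averaged over all *other* tasks (the normalized
# one-hot mask zeroes out the task the score came from). Values below are
# illustrative only.
def _transfer_selection_sketch():
    import numpy as np
    total_reward = np.array([[1., 2.], [3., 4.]])  # shape (num_abs, num_task)
    scores = np.array([[0., 1.], [1., 0.]])        # lower score = better
    # For task 0 the best abstraction is row 0; its transfer reward is its
    # total reward averaged over the remaining tasks: total_reward[0, 1] = 2.
    best = np.argsort(scores[:, 0])[:1]
    mask = np.array([[0., 1.]])                    # 1 - one_hot(0, 2), normalized
    assert np.sum(total_reward[best] * mask, axis=-1)[0] == 2.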
def test_lam_partial_model(self):
    import rewardpredictive as rp
    import rlutils as rl
    import numpy as np

    table_model = rp.utils.TableModel(num_states=4, num_actions=2)
    table_model.update_transition(rl.one_hot(0, 4), 0, 0, rl.one_hot(2, 4), False, {})
    table_model.update_transition(rl.one_hot(0, 4), 1, 0, rl.one_hot(0, 4), False, {})
    table_model.update_transition(rl.one_hot(2, 4), 0, 1, rl.one_hot(2, 4), False, {})
    table_model.update_transition(rl.one_hot(2, 4), 1, 1, rl.one_hot(2, 4), False, {})
    task, phi_mat = self._get_four_state_task()
    t_mat, r_vec = table_model.get_t_mat_r_vec()
    m_mat, w_vec = rp.utils.lam_from_mat_visitation_counts(
        t_mat, r_vec, phi_mat, table_model.visitation_counts())
    m_mat_test = np.array(
        [[[0., 1.], [0., 1.]],
         [[1., 0.], [0., 1.]]], dtype=np.float32)
    w_vec_test = np.array([[0., 1.], [0., 1.]], dtype=np.float32)
    self.assertTrue(np.all(m_mat == m_mat_test))
    self.assertTrue(np.all(w_vec == w_vec_test))
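# Hedged sketch of one common LAM (linear action model) construction, as a
# simplification of lam_from_mat_visitation_counts above that ignores the
# visitation-count weighting: M_a and w_a are the least-squares solutions of
# phi @ M_a ~= T_a @ phi and phi @ w_a ~= r_a. The function name is
# hypothetical and not part of the repository.
def _lam_least_squares_sketch(t_mat, r_vec, phi_mat):
    import numpy as np
    phi_pinv = np.linalg.pinv(phi_mat)
    m_mat = np.stack([phi_pinv @ t_a @ phi_mat for t_a in t_mat])
    w_vec = np.stack([phi_pinv @ r_a for r_a in r_vec])
    return m_mat, w_vec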
def eval_reward_predictive(task, partition, repeats=5, rollout_depth=10):
    phi_mat = cluster_idx_to_phi_mat(partition)
    t_mat, r_vec = task.get_t_mat_r_vec()
    m_mat, w_vec = lam_from_mat(t_mat, r_vec, phi_mat)
    # Sample random action sequences and start states for the rollouts.
    action_seq_list = np.random.randint(
        0, task.num_actions(), size=[repeats, rollout_depth])
    start_state_list = [
        rl.one_hot(i, task.num_states())
        for i in np.random.randint(0, task.num_states(), size=repeats)
    ]
    start_state_list = np.stack(start_state_list).astype(dtype=np.float32)
    rew_err = []
    for i in range(repeats):
        s = start_state_list[i]
        action_seq = action_seq_list[i]
        # Compare the reward sequence of the original task with that of the
        # latent (abstracted) model started from the projected state.
        rew_original = reward_rollout(t_mat, r_vec, s, action_seq)
        rew_latent = reward_rollout(m_mat, w_vec, np.matmul(s, phi_mat), action_seq)
        rew_err.append(np.abs(rew_original - rew_latent))
    return np.array(rew_err, dtype=np.float32)
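# Hedged sketch of the rollout semantics eval_reward_predictive assumes:
# starting from a state (or state distribution) s, each step emits the
# expected one-step reward s @ r_vec[a] and advances s with s @ t_mat[a].
# The function name is hypothetical; the repository's reward_rollout may
# differ in detail.
def _reward_rollout_sketch(t_mat, r_vec, s, action_seq):
    import numpy as np
    rewards = []
    for a in action_seq:
        rewards.append(np.dot(s, r_vec[a]))  # expected immediate reward
        s = np.matmul(s, t_mat[a])           # advance the state distribution
    return np.array(rewards, dtype=np.float32)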
def test_reset(self):
    import rewardpredictive as rp
    import rlutils as rl
    import numpy as np

    table_model = rp.utils.TableModel(3, 2, max_reward=1)
    table_model.update_transition(rl.one_hot(0, 3), 0, .1, rl.one_hot(0, 3), False, {})
    table_model.update_transition(rl.one_hot(0, 3), 0, 0, rl.one_hot(1, 3), False, {})
    table_model.on_simulation_timeout()
    table_model.update_transition(rl.one_hot(1, 3), 1, .5, rl.one_hot(2, 3), True, {})
    table_model.reset()

    t_mat, r_vec = table_model.get_t_mat_r_vec()
    self.assertTrue(np.all(t_mat[0] == np.eye(3, dtype=np.float32)))
    self.assertTrue(np.all(t_mat[1] == np.eye(3, dtype=np.float32)))
    self.assertTrue(np.all(r_vec[0] == np.ones(3, dtype=np.float32)))
    self.assertTrue(np.all(r_vec[1] == np.ones(3, dtype=np.float32)))
    self.assertTrue(np.all(table_model.visitation_counts() == 0))