Example #1
    def test_reset_3(self):
        import rewardpredictive as rp
        import rlutils as rl
        import numpy as np

        agent = rp.utils.SFLearning(num_states=14,
                                    num_actions=3,
                                    learning_rate_sf=0.1,
                                    learning_rate_reward=0.2,
                                    gamma=0.9,
                                    init_sf_mat=np.eye(3 * 14,
                                                       dtype=np.float32),
                                    init_w_vec=np.zeros(3 * 14,
                                                        dtype=np.float32))
        self.assertEqual(agent.get_error_avg(), 0.)

        for _ in range(10):
            agent.update_transition(rl.one_hot(0, 14), 0, 0, rl.one_hot(0, 14),
                                    False, {})

        agent.reset(reset_sf=True, reset_w=True)
        self.assertTrue(
            np.all(agent.get_sf_matrix() == np.eye(3 * 14, dtype=np.float32)))
        self.assertTrue(
            np.all(agent.get_w_vector() == np.zeros(3 * 14, dtype=np.float32)))
Example #2
    def test_get_error_avg(self):
        import rewardpredictive as rp
        import rlutils as rl
        import numpy as np

        agent = rp.utils.SFLearning(num_states=14,
                                    num_actions=3,
                                    learning_rate_sf=0.1,
                                    learning_rate_reward=0.2,
                                    gamma=0.9,
                                    init_sf_mat=np.eye(3 * 14,
                                                       dtype=np.float32),
                                    init_w_vec=np.zeros(3 * 14,
                                                        dtype=np.float32))
        self.assertEqual(agent.get_error_avg(), 0.)

        s = rl.one_hot(0, 14)
        err_dict_list = [
            agent.update_transition(s, 0, 0, s, False, {}) for _ in range(10)
        ]
        err_avg = np.mean([
            np.linalg.norm(d['sf_error']) + abs(d['r_error'])
            for d in err_dict_list
        ])
        self.assertEqual(agent.get_error_avg(), err_avg)
Example #3
    def test(self):
        import rewardpredictive as rp
        import numpy as np
        import rlutils as rl
        from rlutils.environment.gridworld import pt_to_idx

        mdp = rp.mdp.GridWord3x3WithGoalsAndWalls([0, 1], 0, slip_prob=0.)
        t_mat, r_mat = mdp.get_t_mat_r_mat()

        left = rl.environment.gridworld.GridWorldAction.LEFT
        right = rl.environment.gridworld.GridWorldAction.RIGHT
        up = rl.environment.gridworld.GridWorldAction.UP
        down = rl.environment.gridworld.GridWorldAction.DOWN
        pt_to_state = lambda x, y: pt_to_idx((x, y), (3, 3))
        next_state = lambda x, y, a: np.where(
            np.matmul(rl.one_hot(pt_to_state(x, y), 9), t_mat[a]))[0][0]

        self.assertEqual(next_state(0, 0, left), pt_to_state(0, 0))
        self.assertEqual(next_state(0, 1, left), pt_to_state(0, 1))
        self.assertEqual(next_state(0, 2, left), pt_to_state(0, 2))
        self.assertEqual(next_state(1, 0, left), pt_to_state(1, 0))
        self.assertEqual(next_state(1, 1, left), pt_to_state(1, 1))
        self.assertEqual(next_state(1, 2, left), pt_to_state(0, 2))
        self.assertEqual(next_state(2, 0, left), pt_to_state(1, 0))
        self.assertEqual(next_state(2, 1, left), pt_to_state(1, 1))
        self.assertEqual(next_state(2, 2, left), pt_to_state(1, 2))
        self.assertEqual(next_state(0, 0, right), pt_to_state(0, 0))
        self.assertEqual(next_state(0, 1, right), pt_to_state(0, 1))
        self.assertEqual(next_state(0, 2, right), pt_to_state(1, 2))
        self.assertEqual(next_state(1, 0, right), pt_to_state(2, 0))
        self.assertEqual(next_state(1, 1, right), pt_to_state(2, 1))
        self.assertEqual(next_state(1, 2, right), pt_to_state(2, 2))
        self.assertEqual(next_state(2, 0, right), pt_to_state(2, 0))
        self.assertEqual(next_state(2, 1, right), pt_to_state(2, 1))
        self.assertEqual(next_state(2, 2, right), pt_to_state(2, 2))
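The next_state lambda above works because multiplying a one-hot state vector with t_mat[a] selects the corresponding row, i.e. the successor-state distribution; with slip_prob=0. that distribution is itself one-hot, so np.where(...)[0][0] recovers the successor index. A minimal standalone sketch of that lookup, using a made-up 2-state deterministic transition matrix:

import numpy as np

def one_hot(i, n):
    v = np.zeros(n, dtype=np.float32)
    v[i] = 1.
    return v

# Made-up deterministic transition matrix for a single action:
# state 0 -> state 1, state 1 -> state 1 (self-loop).
t_a = np.array([[0., 1.],
                [0., 1.]], dtype=np.float32)

succ_dist = np.matmul(one_hot(0, 2), t_a)  # selects row 0 of t_a
assert np.where(succ_dist)[0][0] == 1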
Example #4
    def test_score_partial_model_small_3(self):
        import rewardpredictive as rp
        import rlutils as rl

        table_model = rp.utils.TableModel(num_states=4, num_actions=2)

        table_model.update_transition(rl.one_hot(0, 4), 0, 0, rl.one_hot(2, 4), False, {})
        table_model.update_transition(rl.one_hot(0, 4), 1, 0, rl.one_hot(0, 4), False, {})
        table_model.update_transition(rl.one_hot(2, 4), 0, 1, rl.one_hot(2, 4), False, {})
        table_model.update_transition(rl.one_hot(2, 4), 1, 1, rl.one_hot(2, 4), False, {})
        table_model.update_transition(rl.one_hot(1, 4), 1, 0, rl.one_hot(1, 4), False, {})

        mdp, phi_mat = self._get_four_state_task()
        t_mat, r_vec = table_model.get_t_mat_r_vec()
        score = rp.reward_maximizing.reward_maximizing_score(
            phi_mat, t_mat, r_vec, table_model.visitation_counts(), gamma=0.9
        )
        self.assertLessEqual(-1e-5, score)
Example #5
    def test_convergence(self):
        import rewardpredictive as rp
        import rlutils as rl
        import numpy as np
        from itertools import product

        mdp = rp.mdp.ColumnWorld2(slip_prob=0.)
        t_mat, r_vec = mdp.get_t_mat_r_vec()
        q_star, _ = rl.algorithm.vi(t_mat, r_vec, gamma=0.9)
        pi_star = np.argmax(q_star, axis=0)

        num_act = mdp.num_actions()
        num_states = mdp.num_states()

        p_mat = np.zeros([num_act * num_states, num_act * num_states],
                         dtype=np.float32)
        for a, s in product(range(num_act), range(num_states)):
            sn = np.where(t_mat[a, s] == 1)[0][0]
            an = pi_star[sn]
            p_mat[num_states * a + s] = rl.one_hot(num_states * an + sn,
                                                   num_states * num_act)
        psi_mat = np.linalg.pinv(np.eye(num_act * num_states) - 0.9 * p_mat)
        w_vec = np.reshape(r_vec, -1)
        q_flat = np.matmul(psi_mat, w_vec)
        self.assertLessEqual(
            np.linalg.norm(q_flat - np.reshape(q_star, -1), ord=np.inf), 1e-4)

        agent = rp.utils.SFLearning(num_states=num_states,
                                    num_actions=num_act,
                                    learning_rate_sf=0.1,
                                    learning_rate_reward=0.2,
                                    gamma=0.9,
                                    init_sf_mat=psi_mat,
                                    init_w_vec=w_vec)
        self.assertLessEqual(np.max(np.abs(agent.get_q_vector() - q_star)),
                             1e-4)
        s = rl.one_hot(0, num_states)
        sn = np.matmul(s, t_mat[0])
        for _ in range(10):
            agent.update_transition(s, 0, r_vec[0, 0], sn, False, {})
        agent.on_simulation_timeout()
        self.assertLessEqual(np.max(np.abs(agent.get_q_vector() - q_star)),
                             1e-4)
Example #6
    def test_update_transition(self):
        import rewardpredictive as rp
        import rlutils as rl
        import numpy as np

        table_model = rp.utils.TableModel(3, 2, max_reward=1)
        t_mat, r_vec = table_model.get_t_mat_r_vec()
        self.assertTrue(np.all(t_mat[0] == np.eye(3, dtype=np.float32)))
        self.assertTrue(np.all(t_mat[1] == np.eye(3, dtype=np.float32)))
        self.assertTrue(np.all(r_vec[0] == np.ones([2, 3], dtype=np.float32)))
        self.assertTrue(np.all(r_vec[1] == np.ones([2, 3], dtype=np.float32)))
        self.assertTrue(np.all(table_model.visitation_counts() == 0))

        table_model.update_transition(rl.one_hot(0, 3), 0, .1,
                                      rl.one_hot(0, 3), False, {})
        table_model.update_transition(rl.one_hot(0, 3), 0, 0, rl.one_hot(1, 3),
                                      False, {})
        table_model.on_simulation_timeout()
        table_model.update_transition(rl.one_hot(1, 3), 1, .5,
                                      rl.one_hot(2, 3), True, {})
        t_mat, r_vec = table_model.get_t_mat_r_vec()
        t_mat_corr = np.array([[[.5, .5, 0.], [0., 1., 0.], [0., 0., 1.]],
                               [[1., 0., 0.], [0., 0., 1.], [0., 0., 1.]]],
                              dtype=np.float32)
        self.assertTrue(np.all(t_mat == t_mat_corr))
        r_vec_corr = np.array([[.05, 1., 1.], [1., 0.5, 1.]], dtype=np.float32)
        self.assertTrue(np.all(r_vec == r_vec_corr))
        visitation_counts_corr = np.array([[2, 0, 0], [0, 1, 0]], dtype=int)
        self.assertTrue(
            np.all(table_model.visitation_counts() == visitation_counts_corr))
Example #7
    def get_t_mat_r_vec(self):
        num_actions, num_states = np.shape(self._v_cnt)
        t_mat = np.zeros(np.shape(self._t_cnt), dtype=np.float32)
        r_vec = np.zeros(np.shape(self._r_sum), dtype=np.float32)

        for a, i in product(range(num_actions), range(num_states)):
            if self._v_cnt[a, i] == 0:
                t_mat[a, i] = rl.one_hot(i, num_states)
                r_vec[a, i] = self._r_default[a, i]
            else:
                t_mat[a, i] = self._t_cnt[a, i] / self._v_cnt[a, i]
                r_vec[a, i] = self._r_sum[a, i] / self._v_cnt[a, i]
        return t_mat, r_vec
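A minimal standalone sketch of the count-based estimate this method computes, using hypothetical count arrays for one action and two states (the array names mirror the attributes above but the values are made up): visited pairs get empirical transition frequencies and mean rewards, unvisited pairs fall back to a self-loop transition and the default reward.

import numpy as np
from itertools import product

# Hypothetical counts for 1 action and 2 states; state 1 was never visited.
v_cnt = np.array([[2, 0]], dtype=np.float32)
t_cnt = np.array([[[1., 1.], [0., 0.]]], dtype=np.float32)
r_sum = np.array([[0.3, 0.]], dtype=np.float32)
r_default = np.ones_like(r_sum)  # optimistic default, cf. max_reward=1 in Example #6

t_mat = np.zeros_like(t_cnt)
r_vec = np.zeros_like(r_sum)
for a, i in product(range(1), range(2)):
    if v_cnt[a, i] == 0:
        t_mat[a, i] = np.eye(2, dtype=np.float32)[i]  # self-loop for unvisited pairs
        r_vec[a, i] = r_default[a, i]
    else:
        t_mat[a, i] = t_cnt[a, i] / v_cnt[a, i]  # empirical transition frequencies
        r_vec[a, i] = r_sum[a, i] / v_cnt[a, i]  # empirical mean reward

assert np.allclose(t_mat, [[[.5, .5], [0., 1.]]])
assert np.allclose(r_vec, [[.15, 1.]])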
Example #8
def _get_best_total_reward_at_transfer(total_reward, scores, frac=0.05):
    num_abs, num_task = np.shape(total_reward)
    best_num = int(np.ceil(num_abs * frac))

    total_reward_transfer = []
    for task_idx in range(num_task):
        best_part = np.argsort(scores[:, task_idx])[:best_num]
        total_reward_all = np.array([total_reward[i] for i in best_part])
        bit_mask = np.reshape(1. - rl.one_hot(task_idx, num_task), [1, -1])
        bit_mask = bit_mask / np.sum(bit_mask, axis=-1, keepdims=True)
        total_reward_transfer_from_task = np.sum(total_reward_all * bit_mask, axis=-1)
        total_reward_transfer.append(total_reward_transfer_from_task)

    total_reward_transfer = np.stack(total_reward_transfer)
    rand_task_idx = np.random.randint(0, num_task, size=best_num)
    total_reward_transfer = np.stack([total_reward_transfer[j, i] for i, j in enumerate(rand_task_idx)])

    return total_reward_transfer
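The 1. - rl.one_hot(task_idx, num_task) mask above, once normalized, places uniform weight on every task except task_idx, so the weighted sum is the mean total reward on the held-out tasks. A minimal NumPy sketch of that masking step with made-up numbers:

import numpy as np

def one_hot(i, n):
    v = np.zeros(n, dtype=np.float32)
    v[i] = 1.
    return v

# Hypothetical total rewards of one abstraction across 4 tasks.
total_reward_row = np.array([10., 2., 4., 6.], dtype=np.float32)

task_idx = 0  # the task the abstraction was scored on
bit_mask = 1. - one_hot(task_idx, 4)    # zero out the scored task
bit_mask = bit_mask / np.sum(bit_mask)  # uniform weight over the other tasks

# Weighted sum equals the mean total reward on the three held-out tasks.
assert np.isclose(np.sum(total_reward_row * bit_mask), np.mean([2., 4., 6.]))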
Example #9
    def test_lam_partial_model(self):
        import rewardpredictive as rp
        import rlutils as rl
        import numpy as np

        table_model = rp.utils.TableModel(num_states=4, num_actions=2)
        table_model.update_transition(rl.one_hot(0, 4), 0, 0, rl.one_hot(2, 4),
                                      False, {})
        table_model.update_transition(rl.one_hot(0, 4), 1, 0, rl.one_hot(0, 4),
                                      False, {})
        table_model.update_transition(rl.one_hot(2, 4), 0, 1, rl.one_hot(2, 4),
                                      False, {})
        table_model.update_transition(rl.one_hot(2, 4), 1, 1, rl.one_hot(2, 4),
                                      False, {})
        task, phi_mat = self._get_four_state_task()
        t_mat, r_vec = table_model.get_t_mat_r_vec()

        m_mat, w_vec = rp.utils.lam_from_mat_visitation_counts(
            t_mat, r_vec, phi_mat, table_model.visitation_counts())
        m_mat_test = np.array([[[0., 1.], [0., 1.]], [[1., 0.], [0., 1.]]],
                              dtype=np.float32)
        w_vec_test = np.array([[0., 1.], [0., 1.]], dtype=np.float32)
        self.assertTrue(np.all(m_mat == m_mat_test))
        self.assertTrue(np.all(w_vec == w_vec_test))
Example #10
def eval_reward_predictive(task, partition, repeats=5, rollout_depth=10):
    phi_mat = cluster_idx_to_phi_mat(partition)
    t_mat, r_vec = task.get_t_mat_r_vec()
    m_mat, w_vec = lam_from_mat(t_mat, r_vec, phi_mat)

    action_seq_list = np.random.randint(0,
                                        task.num_actions(),
                                        size=[repeats, rollout_depth])
    start_state_list = [
        rl.one_hot(i, task.num_states())
        for i in np.random.randint(0, task.num_states(), size=repeats)
    ]
    start_state_list = np.stack(start_state_list).astype(dtype=np.float32)
    rew_err = []
    for i in range(repeats):
        s = start_state_list[i]
        action_seq = action_seq_list[i]
        rew_original = reward_rollout(t_mat, r_vec, s, action_seq)
        rew_latent = reward_rollout(m_mat, w_vec, np.matmul(s, phi_mat),
                                    action_seq)
        rew_err.append(np.abs(rew_original - rew_latent))

    return np.array(rew_err, dtype=np.float32)
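eval_reward_predictive compares reward rollouts of the original model with rollouts of the latent model (m_mat, w_vec) under the same action sequences. The reward_rollout helper is not shown on this page; a plausible minimal sketch, assuming it propagates a state (or latent-state) distribution and records the expected one-step reward at each step:

import numpy as np

def reward_rollout(t_mat, r_vec, s, action_seq):
    # Hypothetical sketch: s is a state (or latent-state) distribution,
    # t_mat[a] a row-stochastic transition matrix, r_vec[a] a per-state reward.
    rewards = []
    for a in action_seq:
        rewards.append(np.matmul(s, r_vec[a]))  # expected immediate reward
        s = np.matmul(s, t_mat[a])              # push the distribution forward
    return np.array(rewards, dtype=np.float32)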
Example #11
    def test_reset(self):
        import rewardpredictive as rp
        import rlutils as rl
        import numpy as np

        table_model = rp.utils.TableModel(3, 2, max_reward=1)
        table_model.update_transition(rl.one_hot(0, 3), 0, .1,
                                      rl.one_hot(0, 3), False, {})
        table_model.update_transition(rl.one_hot(0, 3), 0, 0, rl.one_hot(1, 3),
                                      False, {})
        table_model.on_simulation_timeout()
        table_model.update_transition(rl.one_hot(1, 3), 1, .5,
                                      rl.one_hot(2, 3), True, {})
        table_model.reset()
        t_mat, r_vec = table_model.get_t_mat_r_vec()
        self.assertTrue(np.all(t_mat[0] == np.eye(3, dtype=np.float32)))
        self.assertTrue(np.all(t_mat[1] == np.eye(3, dtype=np.float32)))
        self.assertTrue(np.all(r_vec[0] == np.ones([2, 3], dtype=np.float32)))
        self.assertTrue(np.all(r_vec[1] == np.ones([2, 3], dtype=np.float32)))
        self.assertTrue(np.all(table_model.visitation_counts() == 0))