Example 1
    def test_unsafe_next_of_already_filled(self):
        """
        Load unsafe next_of transitions into an already filled buffer
        """
        buffer_size = 10
        env_dict = {"a": {}}

        rb1 = ReplayBuffer(buffer_size, env_dict, next_of="a")
        rb2 = ReplayBuffer(buffer_size, env_dict, next_of="a")
        rb3 = ReplayBuffer(buffer_size, env_dict, next_of="a")

        a = [1, 2, 3, 4, 5, 6]
        b = [7, 8, 9]

        rb1.add(a=a[:-1], next_a=a[1:])
        rb2.add(a=b[:-1], next_a=b[1:])
        rb3.add(a=b[:-1], next_a=b[1:])

        fname="unsafe_next_of_already.npz"
        rb1.save_transitions(fname, safe=False)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        self.assertEqual(rb1.get_stored_size()+len(b)-1, rb2.get_stored_size())
        self.assertEqual(rb1.get_stored_size()+len(b)-1, rb3.get_stored_size())

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"][len(b)-1:])
        np.testing.assert_allclose(t1["next_a"], t2["next_a"][len(b)-1:])
        np.testing.assert_allclose(t1["a"], t3["a"][len(b)-1:])
        np.testing.assert_allclose(t1["next_a"], t3["next_a"][len(b)-1:])
Example 2
    def test_basic(self):
        """
        Basic Test Case

        Loaded buffer has the same transitions as the saved one.
        """
        buffer_size = 4
        env_dict = {"a": {}}

        rb1 = ReplayBuffer(buffer_size, env_dict)
        rb2 = ReplayBuffer(buffer_size, env_dict)
        rb3 = ReplayBuffer(buffer_size, env_dict)

        a = [1, 2, 3, 4]

        rb1.add(a=a)

        fname = "basic.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
Example 3
    def test_next_of(self):
        """
        Load next_of transitions in safe mode

        In safe mode, next_of is not necessary in the loaded buffer.
        """
        buffer_size = 10
        env_dict1 = {"a": {}}
        env_dict2 = {"a": {}, "next_a": {}}

        rb1 = ReplayBuffer(buffer_size, env_dict1, next_of="a")
        rb2 = ReplayBuffer(buffer_size, env_dict2)
        rb3 = ReplayBuffer(buffer_size, env_dict2)

        a = [1, 2, 3, 4, 5, 6]

        rb1.add(a=a[:-1], next_a=a[1:])

        fname="next_of.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["next_a"], t2["next_a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
        np.testing.assert_allclose(t1["next_a"], t3["next_a"])
Example 4
    def test_stack_compress(self):
        """
        Load stack_compress transitions
        """
        buffer_size = 10
        env_dict = {"a": {"shape": 3}}

        rb1 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")
        rb2 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")
        rb3 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")

        a = [[1, 2, 3],
             [2, 3, 4],
             [3, 4, 5],
             [4, 5, 6]]

        rb1.add(a=a)

        fname="stack_compress.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
Example 5
    def test_load_Nstep(self):
        """
        Load Nstep transitions
        """
        buffer_size = 10
        env_dict = {"done": {}}
        Nstep = {"size": 3, "gamma": 0.99}

        rb1 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
        rb2 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)
        rb3 = ReplayBuffer(buffer_size, env_dict, Nstep=Nstep)

        d = [0, 0, 0, 0, 1]

        rb1.add(done=d)
        rb1.on_episode_end()

        fname="Nstep.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["done"], t2["done"])
        np.testing.assert_allclose(t1["done"], t3["done"])
Example 6
File: issue.py Project: ymd-h/cpprb
    def test_stack_compress(self):
        bsize = 10
        odim = 2
        ssize = 2
        rb = ReplayBuffer(bsize, {"a": {
            "shape": (odim, ssize)
        }},
                          stack_compress="a")
        a = np.random.rand(odim, bsize + ssize - 1)

        for i in range(bsize):
            rb.add(a=a[:, i:i + ssize])

        _a = rb.get_all_transitions()["a"]
        for i in range(bsize):
            with self.subTest(i=i, label="without cache"):
                np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

        for i in range(bsize):
            rb._encode_sample([i])

        rb.clear()

        for i in range(bsize):
            rb.add(a=a[:, i:i + ssize])
            rb.on_episode_end()

        _a = rb.get_all_transitions()["a"]
        for i in range(bsize):
            with self.subTest(i=i, label="without cache"):
                np.testing.assert_allclose(_a[i], a[:, i:i + ssize])

        for i in range(bsize):
            rb._encode_sample([i])
Example 7
    def test_incompatible_unsafe_stack_compress(self):
        """
        Load incompatible stack_compress transitions with unsafe mode
        """
        buffer_size = 10
        env_dict = {"a": {"shape": 3}}

        rb1 = ReplayBuffer(buffer_size, env_dict, stack_compress="a")
        rb2 = ReplayBuffer(buffer_size, env_dict)
        rb3 = ReplayBuffer(buffer_size, env_dict)

        a = [[1, 2, 3],
             [2, 3, 4],
             [3, 4, 5],
             [4, 5, 6]]

        rb1.add(a=a)

        fname="incompatible_unsafe_stack_compress.npz"
        rb1.save_transitions(fname, safe=False)
        rb2.load_transitions(fname)
        rb3.load_transitions(fname)

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
Example 8
    def test_smaller_buffer(self):
        """
        Load to smaller buffer

        The loaded buffer stores only the last buffer_size transitions.
        """
        buffer_size1 = 10
        buffer_size2 = 4
        env_dict = {"a": {}}

        rb1 = ReplayBuffer(buffer_size1, env_dict)
        rb2 = ReplayBuffer(buffer_size2, env_dict)
        rb3 = ReplayBuffer(buffer_size2, env_dict)

        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

        rb1.add(a=a)

        fname = "smaller.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"][-buffer_size2:], t2["a"])
        np.testing.assert_allclose(t1["a"][-buffer_size2:], t3["a"])
Example 9
File: issue.py Project: ymd-h/cpprb
    def test_has_next_of(self):
        bsize = 10
        rb = ReplayBuffer(bsize, {"a": {}}, next_of="a")
        a = np.random.rand(bsize + 1)

        for i in range(bsize):
            rb.add(a=a[i], next_a=a[i + 1])

        _next_a = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(_next_a, a[1:bsize + 1])

        for i in range(bsize):
            rb._encode_sample([i])

        rb.clear()

        for i in range(bsize):
            rb.add(a=a[i], next_a=a[i + 1])
            rb.on_episode_end()

        _next_a = np.ravel(rb.get_all_transitions()["next_a"])
        np.testing.assert_allclose(_next_a, a[1:bsize + 1])

        for i in range(bsize):
            rb._encode_sample([i])
Example 10
    def test_unsafe_next_of_stack_compress(self):
        """
        Load next_of and stack_compress transitions
        """
        buffer_size = 10
        env_dict = {"a": {"shape": 3}}

        rb1 = ReplayBuffer(buffer_size, env_dict, next_of="a", stack_compress="a")
        rb2 = ReplayBuffer(buffer_size, env_dict, next_of="a", stack_compress="a")
        rb3 = ReplayBuffer(buffer_size, env_dict, next_of="a", stack_compress="a")

        a = [[1, 2, 3],
             [2, 3, 4],
             [3, 4, 5],
             [4, 5, 6],
             [5, 6, 7],
             [6, 7, 8]]

        rb1.add(a=a[:-1], next_a=a[1:])

        fname="unsafe_next_of_stack_compress.npz"
        rb1.save_transitions(fname, safe=False)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["next_a"], t2["next_a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
        np.testing.assert_allclose(t1["next_a"], t3["next_a"])
Example 11
    def test_load_to_filled_buffer(self):
        """
        Load into an already filled buffer

        Loaded transitions are appended after the existing ones.
        """
        buffer_size1 = 10
        buffer_size2 = 10
        env_dict = {"a": {}}

        rb1 = ReplayBuffer(buffer_size1, env_dict)
        rb2 = ReplayBuffer(buffer_size2, env_dict)
        rb3 = ReplayBuffer(buffer_size2, env_dict)

        a = [1, 2, 3, 4]
        b = [5, 6]

        rb1.add(a=a)
        rb2.add(a=b)
        rb3.add(a=b)

        fname="filled.npz"
        rb1.save_transitions(fname)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"][len(b):])
        np.testing.assert_allclose(t1["a"], t3["a"][len(b):])
Example 12
    def test_fulled_unsafe_next_of(self):
        """
        Load into an already full buffer
        """
        buffer_size = 10
        env_dict = {"a": {}}

        rb1 = ReplayBuffer(buffer_size, env_dict, next_of="a")
        rb2 = ReplayBuffer(buffer_size, env_dict, next_of="a")
        rb3 = ReplayBuffer(buffer_size, env_dict, next_of="a")

        a = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]

        rb1.add(a=a[:-1], next_a=a[1:])

        fname="fulled_unsafe_next_of.npz"
        rb1.save_transitions(fname, safe=False)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["next_a"], t2["next_a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
        np.testing.assert_allclose(t1["next_a"], t3["next_a"])
Example 13
    def test_incompatible_unsafe_next_of(self):
        """
        Load incompatible next_of transitions with unsafe mode
        """
        buffer_size = 10
        env_dict1 = {"a": {}}
        env_dict2 = {"a": {}, "next_a": {}}

        rb1 = ReplayBuffer(buffer_size, env_dict1, next_of="a")
        rb2 = ReplayBuffer(buffer_size, env_dict2)
        rb3 = ReplayBuffer(buffer_size, env_dict2)

        a = [1, 2, 3, 4, 5, 6]

        rb1.add(a=a[:-1], next_a=a[1:])

        fname="unsafe_incompatible_next_of.npz"
        rb1.save_transitions(fname, safe=False)
        rb2.load_transitions(fname)
        rb3.load_transitions(v(1,fname))

        t1 = rb1.get_all_transitions()
        t2 = rb2.get_all_transitions()
        t3 = rb3.get_all_transitions()

        np.testing.assert_allclose(t1["a"], t2["a"])
        np.testing.assert_allclose(t1["next_a"], t2["next_a"])
        np.testing.assert_allclose(t1["a"], t3["a"])
        np.testing.assert_allclose(t1["next_a"], t3["next_a"])
Example 14
    def test_cache_next_of(self):
        stack_size = 3
        episode_len = 5
        rb = ReplayBuffer(32,
                          {"obs": {
                              "shape": (stack_size,),
                              "dtype": np.int64  # np.int was removed from NumPy
                          }},
                          next_of="obs",
                          stack_compress="obs")

        obs = np.arange(episode_len + stack_size + 2, dtype=np.int64)
        # [0,1,...,episode_len+stack_size+1]
        obs2 = obs + 3 * episode_len
        # [3*episode_len,...,4*episode_len+stack_size+1]

        # Add 1st episode
        for i in range(episode_len):
            rb.add(obs=obs[i:i + stack_size],
                   next_obs=obs[i + 1:i + 1 + stack_size])

        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len)
        for i in range(episode_len):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Reset environment
        rb.on_episode_end()
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len)
        for i in range(episode_len):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Add 2nd episode
        for i in range(episode_len):
            rb.add(obs=obs2[i:i + stack_size],
                   next_obs=obs2[i + 1:i + 1 + stack_size])

        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), 2 * episode_len)
        for i in range(episode_len):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])
        for i in range(episode_len):
            with self.subTest(i=i + episode_len):
                np.testing.assert_equal(s["obs"][i + episode_len],
                                        obs2[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i + episode_len],
                                        obs2[i + 1:i + 1 + stack_size])
Example 15
    def test_shuffle_transitions(self):
        rb = ReplayBuffer(64, {"a": {}})

        a = np.arange(64)
        rb.add(a=a)

        s1 = rb.get_all_transitions()["a"]
        s2 = rb.get_all_transitions(shuffle=True)["a"]

        self.assertFalse((s1 == s2).all())

        s = np.intersect1d(s1, s2, assume_unique=True)
        np.testing.assert_allclose(np.ravel(s), np.ravel(s1))
Example 16
def explorer(global_rb,env_dict,is_training_done,queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size,env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")

    obs = env.reset()
    while not is_training_done.is_set():
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs,act=action,rew=reward,next_obs=next_obs,done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample,priorities=absTD)
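The explorer above is one half of a typical actor/learner split: it fills a small local buffer, scores the batch with the model's absolute TD error, and pushes it into a shared prioritized buffer while pulling fresh weights from a queue. A minimal sketch of the driving side follows, using threads so a single PrioritizedReplayBuffer can be shared in-process (the env_dict layout, MyModel, and the learner loop are assumptions from this excerpt; a true multi-process setup would need cpprb's multiprocessing-aware buffers instead):

import threading
import queue

from cpprb import PrioritizedReplayBuffer

# Assumed CartPole-v1 layout matching the keys explorer() adds.
env_dict = {"obs": {"shape": 4}, "act": {}, "rew": {},
            "next_obs": {"shape": 4}, "done": {}}

global_rb = PrioritizedReplayBuffer(int(1e5), env_dict)
is_training_done = threading.Event()
weight_queue = queue.Queue()

worker = threading.Thread(target=explorer,
                          args=(global_rb, env_dict, is_training_done, weight_queue))
worker.start()

# Learner loop (sketch): sample prioritized batches, train, publish new weights.
# while not done_training:
#     batch = global_rb.sample(32)
#     ...compute loss, update model, global_rb.update_priorities(...)...
#     weight_queue.put(model.weights)

is_training_done.set()
worker.join()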
Example 17
    def test_with_one(self):
        buffer_size = 32
        obs_shape = 3
        act_shape = 4

        rb = ReplayBuffer(buffer_size, {
            "obs": {
                "shape": obs_shape
            },
            "act": {
                "shape": act_shape
            },
            "done": {}
        })

        v = {
            "obs": np.ones(shape=obs_shape),
            "act": np.zeros(shape=act_shape),
            "done": 0
        }

        rb.add(**v)

        tx = rb.get_all_transitions()

        for key in ["obs", "act", "done"]:
            with self.subTest(key=key):
                np.testing.assert_allclose(tx[key],
                                           np.asarray(v[key]).reshape((1, -1)))
Example 18
File: issue.py Project: ymd-h/cpprb
    def test_python_type(self):
        types = [bool, int, float]

        for d in types:
            with self.subTest(type=d):
                b = ReplayBuffer(10, {"a": {"dtype": d}})
                b.add(a=d(1))
                self.assertEqual(b.get_all_transitions()["a"].dtype, d)
Example 19
File: issue.py Project: ymd-h/cpprb
    def test_Nstep_discounts(self):
        buffer_size = 32
        step = 4
        gamma = 0.5
        rb = ReplayBuffer(buffer_size, {"done": {}},
                          Nstep={
                              "size": step,
                              "gamma": gamma
                          })

        rb.add(done=0)
        rb.add(done=0)
        rb.add(done=0)
        self.assertEqual(rb.get_stored_size(), 0)

        rb.add(done=0)
        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[0]]))

        rb.add(done=0)
        np.testing.assert_allclose(rb.get_all_transitions()["done"],
                                   np.asarray([[0], [0]]))
Example 20
File: issue.py Project: ymd-h/cpprb
    def test_next_obs(self):
        buffer_size = 32
        nstep = 4
        gamma = 0.99
        rb = ReplayBuffer(buffer_size, {
            "next_obs": {},
            "done": {}
        },
                          Nstep={
                              "size": nstep,
                              "gamma": gamma,
                              "next": "next_obs"
                          })

        rb.add(next_obs=1, done=0)
        rb.add(next_obs=2, done=0)
        rb.add(next_obs=3, done=0)
        rb.add(next_obs=4, done=0)
        rb.add(next_obs=5, done=0)
        np.testing.assert_allclose(rb.get_all_transitions()["next_obs"],
                                   np.asarray([[4], [5]]))

        rb.add(next_obs=6, done=1)
        rb.on_episode_end()

        sample = rb.get_all_transitions()
        np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                                   np.asarray([4, 5, 6]))

        rb.add(next_obs=7, done=0)
        rb.add(next_obs=8, done=0)
        rb.add(next_obs=9, done=0)
        rb.add(next_obs=10, done=1)
        rb.on_episode_end()
        sample = rb.get_all_transitions()
        np.testing.assert_allclose(sample["next_obs"][sample["done"] == 0.0],
                                   np.asarray([4, 5, 6, 10]))
Example 21
File: issue.py Project: ymd-h/cpprb
    def test_stack(self):
        rb = ReplayBuffer(3, {"a": {"shape": 2}}, stack_compress="a")

        # 1st iteration: Nothing special
        rb.add(a=[0, 1])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[0, 1]]))

        rb.add(a=[1, 2])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[0, 1], [1, 2]]))

        rb.add(a=[2, 3])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[0, 1], [1, 2], [2, 3]]))

        # 2nd iteration: Cache
        rb.add(a=[3, 4])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[3, 4], [1, 2], [2, 3]]))

        rb.add(a=[4, 5])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[3, 4], [4, 5], [2, 3]]))

        rb.add(a=[5, 6])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[3, 4], [4, 5], [5, 6]]))

        # 3rd iteration: Clean up cache beforehand and set new cache
        rb.add(a=[6, 7])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[6, 7], [4, 5], [5, 6]]))

        rb.add(a=[7, 8])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[6, 7], [7, 8], [5, 6]]))

        rb.add(a=[8, 9])
        np.testing.assert_allclose(rb.get_all_transitions()["a"],
                                   np.asarray([[6, 7], [7, 8], [8, 9]]))
Example 22
File: issue.py Project: ymd-h/cpprb
    def test_dtype_check(self):
        types = [
            np.bool_, np.bool8, np.byte, np.short, np.intc, np.int_,
            np.longlong, np.intp, np.int8, np.int16, np.int32, np.int64,
            np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong, np.uintp,
            np.uint8, np.uint16, np.uint32, np.uint64, np.half, np.single,
            np.double, np.float_, np.longfloat, np.float16, np.float32,
            np.float64, np.csingle, np.complex_, np.clongfloat, np.complex64,
            np.complex128
        ]

        for d in types:
            with self.subTest(type=d):
                b = ReplayBuffer(10, {"a": {"dtype": d}})
                b.add(a=np.ones(1, dtype=d))
                self.assertEqual(b.get_all_transitions()["a"].dtype, d)
Example 23
    def test_with_empty(self):
        buffer_size = 32
        obs_shape = 3
        act_shape = 4

        rb = ReplayBuffer(buffer_size, {
            "obs": {
                "shape": obs_shape
            },
            "act": {
                "shape": act_shape
            },
            "done": {}
        })

        tx = rb.get_all_transitions()

        for key in ["obs", "act", "done"]:
            with self.subTest(key=key):
                self.assertEqual(tx[key].shape[0], 0)
Example 24
class RainbowAgent:
    """Agent interacting with environment.
    
    Attributes:
        env (gym.Env): openAI Gym environment
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including 
                           state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """

    def __init__(
        self, 
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10, 
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",

    ):
        """Initialization.
        
        Args:
            env (gym.Env): openAI Gym environment
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n
        
        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma
        # NoisyNet: All attributes related to epsilon are removed

        # Produce a unique timestamp for each run: day_month_hour_minute_second
        lt = time.localtime(time.time())
        run_timestamp = "{}_{}_{}_{}_{}".format(lt.tm_mday, lt.tm_mon,
                                                lt.tm_hour, lt.tm_min, lt.tm_sec)

        #Will write scalars that can be visualized using tensorboard in the directory "runLogs/timestamp"
        self.writer = SummaryWriter("runLogs/" + run_timestamp)


        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu"
        )
        print(self.device)
        
        # PER
        # memory for 1-step Learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha    
        )
        
        # memory for N-step Learning
        self.use_n_step = True if n_step > 1 else False
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )
            
        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = torch.linspace(
            self.v_min, self.v_max, self.atom_size
        ).to(self.device)

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support
        ).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()
        
        # optimizer
        self.optimizer = optim.Adam(self.dqn.parameters(),0.0001)

        # transition to store in memory
        self.transition = list()
        
        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        # self.tensorboard = RainbowTensorBoard(
        #     log_dir="single_joint_logs/{}-{}".format(
        #         model_name,
        #         datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
        #     )
        # )
        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score 
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p


    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = self.dqn(
            torch.FloatTensor(state).to(self.device)
        ).argmax()
        selected_action = selected_action.detach().cpu().numpy()
        
        if not self.is_test:

            self.transition = [state, selected_action]
        

        return selected_action


    def step(self, action: np.ndarray, score:int) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action,score)

        if not self.is_test:
            self.transition += [reward, next_state, done]
            
            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], self.transition)
                    )
                )
                one_step_transition = [v[idx] for _, v in self.memory_n.get_all_transitions().items()] if idx is not None else None

            # 1-step transition
            else:
                one_step_transition = self.transition

            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition)
                    )
                )
    
        return next_state, reward, done


    def update_model(self,frame_idx:int) -> torch.Tensor:
        """Update the model by gradient descent.
        shape of elementwise_loss = [128,51]
        shape of loss = ([])
        shape of weights ([128,1)]
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = torch.FloatTensor(
            samples["weights"].reshape(-1, 1)
        ).to(self.device)
        indices = samples["indexes"]
        #rospy.loginfo(samples.keys())
        #rospy.loginfo(weights.shape)
        #rospy.loginfo(indices.shape())

        #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time())))
        
        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)
        
        # PER: importance sampling before average
        loss = torch.mean(elementwise_loss * weights)
        
        self.writer.add_scalar('update_model/Lossv0', loss.detach().item(),frame_idx )
        
        # N-step Learning loss
        # We combine the 1-step loss and the n-step loss to prevent
        # high variance. The original Rainbow employs the n-step loss only.
        if self.use_n_step:
            gamma = self.gamma ** self.n_step
            samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()}
            elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
            elementwise_loss += elementwise_loss_n_loss
            
            #rospy.loginfo(elementwise_loss_n_loss.shape)
            #rospy.loginfo(elementwise_loss.shape)

            # PER: importance sampling before average
            loss = torch.mean(elementwise_loss * weights)

        
        rospy.loginfo(
            f"{elementwise_loss}"
            )
        self.optimizer.zero_grad()
        self.writer.add_scalar('update_model/Lossv1', loss.detach().item(),frame_idx )
        #From pytorch doc: backward() Computes the gradient of current tensor w.r.t. graph leaves.
        #self.writer.add_image("loss gradient before", loss, frame_idx)
        loss.backward()
        #self.writer.add_image("loss gradient after", loss, frame_idx)
        self.writer.add_scalar('update_model/Lossv2', loss.detach().item(),frame_idx )
        clip_grad_norm_(self.dqn.parameters(), 10.0)
        self.optimizer.step()
        
        # PER: update priorities
        loss_for_prior = elementwise_loss.detach().cpu().numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)
        
        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()
        
        #rospy.loginfo("second")
        #rospy.loginfo(loss.shape)

        #rospy.loginfo("loss dimension = " + loss.ndim()  )   
        #rospy.loginfo("loss = " + str(loss.detach().item()) + "type = " + str(type(loss.detach().item())  )   )   
        self.writer.add_scalar('update_model/Loss', loss.detach().item(),frame_idx )
        return loss.detach().item()


    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False
        
        state = self.env.reset()
        update_cnt = 0
        losses = []
        scores = []
        score = 0

        for frame_idx in tqdm(range(1, num_frames + 1)):

            action = self.select_action(state)
            next_state, reward, done = self.step(action,score)

            state = next_state
            score += reward
            
            # NoisyNet: removed decrease of epsilon
            
            # PER: increase beta
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            # if episode ends
            if done:
                #rospy.loginfo("logging for done")
                self.writer.add_scalar('train/score', score, frame_idx)
                self.writer.add_scalar('train/final_epsilon', state[6], frame_idx)
                self.writer.add_scalar('train/epsilon_p', state[7], frame_idx)
                state = self.env.reset()
                scores.append(score)
                score = 0

            # if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                #frame_id given as argument for logging by self.writer. 
                #rospy.loginfo("frame_idx= " + str(frame_idx) + "type = " + str(type(frame_idx)))
                loss = self.update_model(frame_idx)

                losses.append(loss)
                update_cnt += 1
                
                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update(loss)

        self.env.close()


    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True
        
        state = self.env.reset()
        done = False
        score = 0
        
        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action, score)

            state = next_state
            score += reward
        
        print("score: ", score)
        self.env.close()
        
        return frames


    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> torch.Tensor:
        """Return categorical dqn loss."""
        device = self.device  # for shortening the following lines
        state = torch.FloatTensor(samples["obs"]).to(device)
        next_state = torch.FloatTensor(samples["next_obs"]).to(device)
        action = torch.LongTensor(samples["act"]).to(device)
        reward = torch.FloatTensor(np.array(samples["rew"]).reshape(-1, 1)).to(device)
        done = torch.FloatTensor(np.array(samples["done"]).reshape(-1, 1)).to(device)
        
        # Categorical DQN algorithm
        delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

        with torch.no_grad():
            # Double DQN
            next_action = self.dqn(next_state).argmax(1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = next_dist[range(self.batch_size), next_action]

            t_z = reward + (1 - done) * gamma * self.support
            t_z = t_z.clamp(min=self.v_min, max=self.v_max)
            b = (t_z - self.v_min) / delta_z
            l = b.floor().long()
            u = b.ceil().long()

            offset = (
                torch.linspace(
                    0, (self.batch_size - 1) * self.atom_size, self.batch_size
                ).long()
                .unsqueeze(1)
                .expand(self.batch_size, self.atom_size)
                .to(self.device)
            )

            proj_dist = torch.zeros(next_dist.size(), device=self.device)
            proj_dist.view(-1).index_add_(
                0, (l + offset).view(-1), (next_dist * (u.float() - b)).view(-1)
            )
            proj_dist.view(-1).index_add_(
                0, (u + offset).view(-1), (next_dist * (b - l.float())).view(-1)
            )
            print(f"Next Action : {next_action}\n Next Dist : {next_dist}\n")

        dist = self.dqn.dist(state)
        log_p = torch.log(dist[range(self.batch_size), action])
        elementwise_loss = -(proj_dist * log_p).sum(1)
        print(f"Proj Dist : {proj_dist}\n Dist : {dist}\n Log_p : {log_p}\n")
        if torch.isnan(elementwise_loss[0][0]):
            exit()

        return elementwise_loss


    def _target_hard_update(self,loss):
        """Hard update: target <- local."""
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        #torch.save(self.dqn.state_dict(),str("checkpoint_"+str(time.time())))

        torch.save({
            'model_state_dict': self.dqn.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'loss': loss,
            }, str("checkpoints/checkpoint_"+str(time.time())))
Example 25
def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True, record=False, record_project= 'benchmarking', record_name = 'trained' , data_path='', config_name='test', max_len_rb=100, benchmark=False, log_prefix=''):
    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    logger = EpochLogger()
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    ep_cost = 0
    local_steps_per_epoch = int(4000 / num_procs())

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape

    rew_mov_avg_10 = []
    cost_mov_avg_10 = []

    if benchmark:
        ep_costs = []
        ep_rewards = []

    if record:
        wandb.login()
        # 4 million env interactions
        wandb.init(project=record_project, name=record_name)

        rb = ReplayBuffer(size=10000,
                          env_dict={
                              "obs": {"shape": obs_dim},
                              "act": {"shape": act_dim},
                              "rew": {},
                              "next_obs": {"shape": obs_dim},
                              "done": {}})

        # columns = ['observation', 'action', 'reward', 'cost', 'done']
        # sim_data = pd.DataFrame(index=[0], columns=columns)

    while n < num_episodes:
        if render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        next_o, r, d, info = env.step(a)

        if record:
            # buf.store(next_o, a, r, None, info['cost'], None, None, None)
            done_int = int(d==True)
            rb.add(obs=o, act=a, rew=r, next_obs=next_o, done=done_int)

        ep_ret += r
        ep_len += 1
        ep_cost += info['cost']

        # Important!
        o = next_o

        if d or (ep_len == max_ep_len):
            # finish recording and save csv
            if record:
                rb.on_episode_end()

                # make directory if does not exist
                if not os.path.exists(data_path + config_name + '_episodes'):
                    os.makedirs(data_path + config_name + '_episodes')

                # buf = CostPOBuffer(obs_dim, act_dim, local_steps_per_epoch, 0.99, 0.99)

            if len(rew_mov_avg_10) >= 25:
                rew_mov_avg_10.pop(0)
                cost_mov_avg_10.pop(0)

            rew_mov_avg_10.append(ep_ret)
            cost_mov_avg_10.append(ep_cost)

            mov_avg_ret = np.mean(rew_mov_avg_10)
            mov_avg_cost = np.mean(cost_mov_avg_10)

            expert_metrics = {log_prefix + 'episode return': ep_ret,
                              log_prefix + 'episode cost': ep_cost,
                              # 'cumulative return': cum_ret,
                              # 'cumulative cost': cum_cost,
                              log_prefix + '25ep mov avg return': mov_avg_ret,
                              log_prefix + '25ep mov avg cost': mov_avg_cost
                              }

            if benchmark:
                ep_rewards.append(ep_ret)
                ep_costs.append(ep_cost)

            wandb.log(expert_metrics)
            logger.store(EpRet=ep_ret, EpLen=ep_len, EpCost=ep_cost)
            print('Episode %d \t EpRet %.3f \t EpLen %d \t EpCost %d' % (n, ep_ret, ep_len, ep_cost))
            o, r, d, ep_ret, ep_len, ep_cost = env.reset(), 0, False, 0, 0, 0
            n += 1


    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()

    if record:
        print("saving final buffer")
        bufname_pk = data_path + config_name + '_episodes/sim_data_' + str(int(num_episodes)) + '_buffer.pkl'
        file_pi = open(bufname_pk, 'wb')
        pickle.dump(rb.get_all_transitions(), file_pi)
        wandb.finish()

        return rb

    if benchmark:
        return ep_rewards, ep_costs
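run_policy() persists the collected transitions by pickling the dict returned by rb.get_all_transitions(), so reading the file back yields a plain dict of NumPy arrays keyed by obs/act/rew/next_obs/done. A minimal sketch of loading such a file (the path below is a made-up example following the naming scheme above):

import pickle

# Hypothetical path built the way run_policy() builds bufname_pk.
bufname_pk = "data/test_episodes/sim_data_100_buffer.pkl"

with open(bufname_pk, "rb") as f:
    transitions = pickle.load(f)

# dict of NumPy arrays: "obs", "act", "rew", "next_obs", "done"
print({key: arr.shape for key, arr in transitions.items()})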
Example 26
class LearnerBase(tf.Module):
    # PUBLIC
    def __init__(self,
                 model,
                 filename=None,
                 bufferSize=264,
                 numEpochs=100,
                 batchSize=30,
                 log=False,
                 logPath=None):
        self.model = model
        self.sDim = model.get_state_dim()
        self.aDim = model.get_action_dim()
        self.optimizer = tf.optimizers.Adam(learning_rate=0.5)
        self.rb = ReplayBuffer(bufferSize,
                               env_dict={
                                   "obs": {
                                       "shape": (self.sDim, 1)
                                   },
                                   "act": {
                                       "shape": (self.aDim, 1)
                                   },
                                   "next_obs": {
                                       "shape": (self.sDim, 1)
                                   }
                               })
        self.numEpochs = numEpochs
        self.batchSize = batchSize

        if filename is not None:
            self.load_rb(filename)

        self.log = log
        self.step = 0

        if self.log:
            stamp = datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
            self.logdir = os.path.join(logPath, "learner", stamp)
            self.writer = tf.summary.create_file_writer(self.logdir)
            self._save_graph()

    def load_rb(self, filename):
        self.rb.load_transitions(filename)

    def add_rb(self, x, u, xNext):
        self.rb.add(obs=x, act=u, next_obs=xNext)

    def train(self, X, y, batchSize=-1, epoch=1, learninRate=0.1, kfold=None):
        for e in range(epoch):
            if batchSize == -1:
                batchLoss = self._train_step(X, y)
                if self.log:
                    with self.writer.as_default():
                        if kfold is not None:
                            scope = "epoch{}/batch{}/lr{}/loss".format(
                                epoch, batchSize, learninRate)
                        else:
                            scope = "Loss"
                        tf.summary.scalar(scope, batchLoss, self.step)
                        self.step += 1
                pass
            for i in range(0, X.shape[0], batchSize):
                batchLoss = self._train_step(X[i:i + batchSize],
                                             y[i:i + batchSize])
                if self.log:
                    with self.writer.as_default():
                        if kfold is not None:
                            scope = "epoch{}/batch{}/lr{}/loss_fold{}".format(
                                epoch, batchSize, learninRate, kfold)
                        else:
                            scope = "Loss"
                        tf.summary.scalar(scope, batchLoss, self.step)
                        self.step += 1

    def rb_trans(self):
        return self.rb.get_all_transitions().copy()

    def save_rb(self, filename):
        self.rb.save_transitions(filename)

    def save_params(self, step):
        self.model.save_params(self.logdir, step)

    def grid_search(self, trajs, actionSeqs):
        init_weights = self.model.get_weights()

        learningRate = np.linspace(0.0001, 0.1, 10)
        batchSize = np.array([-1])
        epoch = np.array([100, 500, 1000])

        mean = []
        for lr in learningRate:
            for bs in batchSize:
                for e in epoch:
                    fold = self.k_fold_validation(learningRate=lr,
                                                  batchSize=bs,
                                                  epoch=e,
                                                  k=10)
                    mean.append(np.mean(fold))
                    print("*" * 5, " Grid ", 5 * "*")
                    print("lr: ", lr)
                    print("bs: ", bs)
                    print("e: ", e)
                    print("fold: ", fold)
                    print("mean: ", np.mean(fold))

                    self.train_all(learningRate=lr, batchSize=bs, epoch=e)
                    err = self.validate(actionSeqs, trajs)
                    print("validation error: ", err.numpy())
                    self.model.update_weights(init_weights, msg=False)
        print("Best mean:", np.max(mean))

    def train_all(self, learningRate=0.1, batchSize=32, epoch=100):
        self.optimizer = tf.optimizers.Adam(learning_rate=learningRate)
        data = self.rb_trans()
        (X, y) = self.model.prepare_training_data(data['obs'],
                                                  data['next_obs'],
                                                  data['act'])
        self.step = 0
        self.train(X,
                   y,
                   batchSize=batchSize,
                   epoch=epoch,
                   learninRate=learningRate)

    def k_fold_validation(self,
                          k=10,
                          learningRate=0.1,
                          batchSize=32,
                          epoch=100):
        # First get all the data
        self.optimizer = tf.optimizers.Adam(learning_rate=learningRate)
        data = self.rb_trans()
        (X, y) = self.model.prepare_training_data(data['obs'],
                                                  data['next_obs'],
                                                  data['act'])
        kfold = KFold(n_splits=k, shuffle=True)

        init_weights = self.model.get_weights()
        fold = []
        X = X.numpy()
        y = y.numpy()
        i = 0
        for train, test in kfold.split(X, y):
            self.step = 0
            self.train(X[train],
                       y[train],
                       batchSize=batchSize,
                       epoch=epoch,
                       learninRate=learningRate,
                       kfold=i)
            self.model.update_weights(init_weights, msg=False)
            lossFold = self.evaluate(X[test], y[test])
            fold.append(lossFold.numpy())
            i += 1

        self.model.update_weights(init_weights, msg=False)
        return fold

    def evaluate(self, X, y):
        pred = self.model._predict_nn("Eval", np.squeeze(X, axis=-1))
        loss = tf.reduce_mean(tf.math.squared_difference(pred, y), name="loss")
        return loss

    def plot_seq(self, traj, gtTraj):
        fig, axs = plt.subplots(figsize=(20, 10), nrows=2, ncols=8)
        # Position
        axs[0, 0].plot(traj[:, 0])
        axs[0, 0].plot(gtTraj[:, 0])

        axs[0, 1].plot(traj[:, 1])
        axs[0, 1].plot(gtTraj[:, 1])

        axs[0, 2].plot(traj[:, 2])
        axs[0, 2].plot(gtTraj[:, 2])

        # Quaternion
        axs[0, 3].plot(traj[:, 3])
        axs[0, 3].plot(gtTraj[:, 3])

        axs[0, 4].plot(traj[:, 4])
        axs[0, 4].plot(gtTraj[:, 4])

        axs[0, 5].plot(traj[:, 5])
        axs[0, 5].plot(gtTraj[:, 5])

        axs[0, 6].plot(traj[:, 6])
        axs[0, 6].plot(gtTraj[:, 6])

        # Lin Vel
        axs[1, 0].plot(traj[:, 0])
        axs[1, 0].plot(gtTraj[:, 0])

        axs[1, 1].plot(traj[:, 1])
        axs[1, 1].plot(gtTraj[:, 1])

        axs[1, 2].plot(traj[:, 2])
        axs[1, 2].plot(gtTraj[:, 2])

        # Ang vel
        axs[1, 3].plot(traj[:, 3])
        axs[1, 3].plot(gtTraj[:, 3])

        axs[1, 4].plot(traj[:, 4])
        axs[1, 4].plot(gtTraj[:, 4])

        axs[1, 5].plot(traj[:, 5])
        axs[1, 5].plot(gtTraj[:, 5])
        plt.show()

    def validate(self, actionSeqs, gtTrajs):
        '''
            computes the error of the model for a number of trajectories with
            the matching action sequences.

            - input:
            --------
                - actionSeqs: Tensor of the action sequences.
                    Shape [k, tau, 6, 1]
                
                - gtTrajs: Tensor of the ground truth trajectories.
                    Shape [k, tau, 13, 1]

            - output:
            ---------
                - L(nn(actionSeqs), trajs), the loss between the predicted trajectory
                and the ground truth trajectory.
        '''
        tau = actionSeqs.shape[1]
        k = actionSeqs.shape[0]
        state = np.expand_dims(gtTrajs[:, 0], axis=-1)
        trajs = [np.expand_dims(state, axis=1)]
        # PAY ATTENTION TO THE FOR LOOPS WITH @tf.function.
        for i in range(tau - 1):
            with tf.name_scope("Rollout_" + str(i)):
                with tf.name_scope("Prepare_data_" + str(i)) as pd:
                    # make the action a [1, 6, 1] tensor
                    action = np.expand_dims(actionSeqs[:, i], axis=-1)
                with tf.name_scope("Step_" + str(i)) as s:
                    nextState = self.model.build_step_graph(s, state, action)
            state = nextState
            trajs.append(np.expand_dims(state, axis=1))

        trajs = np.squeeze(np.concatenate(trajs, axis=1), axis=-1)
        err = tf.linalg.norm(tf.subtract(trajs, gtTrajs)) / k

        self.plot_seq(trajs[0], gtTrajs[0])
        return err

    # PRIVATE
    def _train_step(self, X, y):
        # If batchSize = -1, feed in the entire batch
        with tf.GradientTape() as tape:
            pred = self.model._predict_nn("train", np.squeeze(X, axis=-1))
            loss = tf.reduce_mean(tf.math.squared_difference(pred, y),
                                  name="loss")
            grads = tape.gradient(loss, self.model.weights())
            self.optimizer.apply_gradients(zip(grads, self.model.weights()))
            return loss

    def _save_graph(self):
        state = tf.zeros((1, self.model.get_state_dim(), 1), dtype=tf.float64)
        action = tf.zeros((1, self.model.get_action_dim(), 1),
                          dtype=tf.float64)
        with self.writer.as_default():
            graph = tf.function(
                self.model.build_step_graph).get_concrete_function(
                    "graph", state, action).graph
            # visualize
            summary_ops_v2.graph(graph.as_graph_def())
Example 27
class HindsightReplayBuffer:
    """
    Replay Buffer class for Hindsight Experience Replay

    Ref: https://arxiv.org/abs/1707.01495
    """
    def __init__(self,
                 size: int,
                 env_dict: Dict,
                 max_episode_len: int,
                 reward_func: Callable,
                 *,
                 goal_func: Optional[Callable] = None,
                 goal_shape: Optional[Iterable[int]] = None,
                 state: str = "obs",
                 action: str = "act",
                 next_state: str = "next_obs",
                 strategy: str = "future",
                 additional_goals: int = 4,
                 prioritized=True,
                 **kwargs):
        """
        Initialize HindsightReplayBuffer

        Parameters
        ----------
        size : int
            Buffer Size
        env_dict : dict of dict
            Dictionary specifying environments. The keys of env_dict become
            environment names. The values of env_dict, which are also dicts,
            define "shape" (default 1) and "dtype" (fallback to `default_dtype`)
        max_episode_len : int
            Maximum episode length.
        reward_func : Callable[[np.ndarray, np.ndarray, np.ndarray], np.ndarray]
            Batch calculation of reward function SxAxG -> R.
        goal_func : Callable[[np.ndarray], np.ndarray], optional
            Batch extraction function for goal from state: S->G.
            If ``None`` (default), identity function is used (goal = state).
        goal_shape : Iterable[int], optional
            Shape of goal. If ``None`` (default), state shape is used.
        state : str, optional
            State name in ``env_dict``. The default is "obs".
        action : str, optional
            Action name in ``env_dict``. The default is "act".
        next_state : str, optional
            Next state name in ``env_dict``. The default is "next_obs".
        strategy : ["future", "episode", "random", "final"], optional
            Goal sampling strategy.
            "future" selects one of the future states in the same episode.
            "episode" selects states in the same episode.
            "random" selects from the all states in replay buffer.
            "final" selects the final state in the episode. For "final",
            ``additional_goals`` is ignored.
            The default is "future"
        additional_goals : int, optional
            Number of additional goals. The default is ``4``.
        prioritized : bool, optional
            Whether use Prioritized Experience Replay. The default is ``True``.
        """
        self.max_episode_len = max_episode_len
        self.reward_func = reward_func
        self.goal_func = goal_func or (lambda s: s)

        self.state = state
        self.action = action
        self.next_state = next_state

        self.strategy = strategy
        known_strategy = ["future", "episode", "random", "final"]
        if self.strategy not in known_strategy:
            raise ValueError(f"Unknown Strategy: {strategy}. " +
                             f"Known Strategies: {known_strategy}")

        self.additional_goals = additional_goals
        if self.strategy == "final":
            self.additional_goals = 1

        self.prioritized = prioritized

        if goal_shape:
            goal_dict = {**env_dict[state], "shape": goal_shape}
            self.goal_shape = np.array(goal_shape, ndmin=1)
        else:
            goal_dict = env_dict[state]
            self.goal_shape = np.array(env_dict[state].get("shape", 1),
                                       ndmin=1)
        RB = PrioritizedReplayBuffer if self.prioritized else ReplayBuffer
        self.rb = RB(size, {
            **env_dict, "rew": {},
            "goal": goal_dict
        }, **kwargs)

        self.episode_rb = ReplayBuffer(self.max_episode_len, env_dict)

        self.rng = np.random.default_rng()

    def add(self, **kwargs):
        r"""Add transition(s) into replay buffer.

        Multiple sets of transitions can be added simultaneously.

        Parameters
        ----------
        **kwargs : array like or float or int
            Transitions to be stored.
        """
        if self.episode_rb.get_stored_size() >= self.max_episode_len:
            raise ValueError("Exceed Max Episode Length")
        self.episode_rb.add(**kwargs)

    def sample(self, batch_size: int, **kwargs):
        r"""Sample the stored transitions randomly with speciped size

        Parameters
        ----------
        batch_size : int
            sampled batch size

        Returns
        -------
        sample : dict of ndarray
            Batch of sampled transitions, which might contain
            the same transition multiple times.
        """
        return self.rb.sample(batch_size, **kwargs)

    def on_episode_end(self, goal):
        """
        Terminate the current episode and set hindsight goal

        Parameters
        ----------
        goal : array-like
            Original goal state of this episode.
        """
        episode_len = self.episode_rb.get_stored_size()
        if episode_len == 0:
            return None

        trajectory = self.episode_rb.get_all_transitions()
        add_shape = (trajectory[self.state].shape[0], *self.goal_shape)

        goal = np.broadcast_to(np.asarray(goal), add_shape)
        rew = self.reward_func(trajectory[self.next_state],
                               trajectory[self.action], goal)

        self.rb.add(**trajectory, goal=goal, rew=rew)

        if self.strategy == "future":
            idx = np.zeros((self.additional_goals, episode_len),
                           dtype=np.int64)
            for i in range(episode_len):
                idx[:, i] = self.rng.integers(low=i,
                                              high=episode_len,
                                              size=self.additional_goals)
            for i in range(self.additional_goals):
                goal = self.goal_func(trajectory[self.next_state][idx[i]])
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action], goal)
                self.rb.add(**trajectory, rew=rew, goal=goal)
        elif self.strategy == "episode":
            idx = self.rng.integers(low=0,
                                    high=episode_len,
                                    size=(self.additional_goals, episode_len))
            for _i in idx:
                goal = self.goal_func(trajectory[self.next_state][_i])
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action], goal)
                self.rb.add(**trajectory, rew=rew, goal=goal)
        elif self.strategy == "final":
            goal = self.goal_func(
                np.broadcast_to(trajectory[self.next_state][-1],
                                trajectory[self.next_state].shape))
            rew = self.reward_func(trajectory[self.next_state],
                                   trajectory[self.action], goal)
            self.rb.add(**trajectory, rew=rew, goal=goal)
        else:  # random
            # Note 1:
            #   We should not prioritize goal selection,
            #   so that we manually create indices.
            # Note 2:
            #   Since we cannot access internal data directly,
            #   we have to extract set of transitions.
            #   Although this has overhead, it is fine
            #   because the "random" strategy is used only
            #   for strategy comparison.
            idx = self.rng.integers(low=0,
                                    high=self.rb.get_stored_size(),
                                    size=self.additional_goals * episode_len)
            goal = self.goal_func(self.rb._encode_sample(idx)[self.next_state])
            goal = goal.reshape(
                (self.additional_goals, episode_len, *(goal.shape[1:])))
            for g in goal:
                rew = self.reward_func(trajectory[self.next_state],
                                       trajectory[self.action], g)
                self.rb.add(**trajectory, rew=rew, goal=g)

        self.episode_rb.clear()
        self.rb.on_episode_end()

    def clear(self):
        """
        Clear replay buffer
        """
        self.rb.clear()
        self.episode_rb.clear()

    def get_stored_size(self):
        """
        Get stored size

        Returns
        -------
        int
            stored size
        """
        return self.rb.get_stored_size()

    def get_buffer_size(self):
        """
        Get buffer size

        Returns
        -------
        int
            buffer size
        """
        return self.rb.get_buffer_size()

    def get_all_transitions(self, shuffle: bool = False):
        r"""
        Get all transitions stored in replay buffer.

        Parameters
        ----------
        shuffle : bool, optional
            When True, transitions are shuffled. The default value is False.

        Returns
        -------
        transitions : dict of numpy.ndarray
            All transitions stored in this replay buffer.
        """
        return self.rb.get_all_transitions(shuffle)

    def update_priorities(self, indexes, priorities):
        """
        Update priorities

        Parameters
        ----------
        indexes : array_like
            indexes to update priorities
        priorities : array_like
            priorities to update

        Raises
        ------
        TypeError: When ``indexes`` or ``priorities`` are ``None``
        ValueError: When this buffer is constructed with ``prioritized=False``
        """
        if not self.prioritized:
            raise ValueError("Buffer is constructed without PER")

        self.rb.update_priorities(indexes, priorities)

    def get_max_priority(self):
        """
        Get max priority

        Returns
        -------
        float
            Max priority of stored priorities

        Raises
        ------
        ValueError: When this buffer is constructed with ``prioritized=False``
        """
        if not self.prioritized:
            raise ValueError("Buffer is constructed without PER")

        return self.rb.get_max_priority()
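
For reference, here is a minimal standalone usage sketch of the HindsightReplayBuffer above. It is illustrative only: the 1-D "obs"/"act" layout and the sparse, distance-based reward_func are assumptions, not part of the class.

import numpy as np
from cpprb import HindsightReplayBuffer

def reward_func(next_obs, act, goal):
    # Sparse reward (assumed for illustration): 0 once the 1-D state is
    # within 0.5 of the goal, otherwise -1.
    return -(np.abs(next_obs - goal) > 0.5).astype(np.float64).ravel()

hrb = HindsightReplayBuffer(size=1000,
                            env_dict={"obs": {}, "act": {}, "next_obs": {}},
                            max_episode_len=100,
                            reward_func=reward_func,
                            strategy="future",
                            additional_goals=4,
                            prioritized=True)

for step in range(10):
    hrb.add(obs=step, act=0, next_obs=step + 1)
hrb.on_episode_end(goal=10)   # relabel the episode with hindsight goals

sample = hrb.sample(32)       # dict with "obs", "act", "next_obs", "rew", "goal",
                              # plus "weights"/"indexes" because prioritized=True
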
Example n. 28
0
    def test_smaller_episode_than_stack_frame(self):
        """
        `on_episode_end()` caches the last frames up to the stack size.

        When the episode length is smaller than the stack size,
        `on_episode_end()` must avoid caching frames from the previous episode.

        Since the cache does not wrap around, this bug does not occur
        in the first episode.

        Ref: https://gitlab.com/ymd_h/cpprb/-/issues/108
        Ref: https://gitlab.com/ymd_h/cpprb/-/issues/110
        """
        stack_size = 4
        episode_len1 = 5
        episode_len2 = 2
        rb = ReplayBuffer(32,
                          {"obs": {
                              "shape": (stack_size,),
                              "dtype": np.int64
                          }},
                          next_of="obs",
                          stack_compress="obs")

        obs = np.arange(episode_len1 + stack_size + 2, dtype=np.int64)
        obs2 = np.arange(episode_len2 + stack_size + 2, dtype=np.int64) + 100

        self.assertEqual(rb.get_current_episode_len(), 0)

        # Add 1st episode
        for i in range(episode_len1):
            rb.add(obs=obs[i:i + stack_size],
                   next_obs=obs[i + 1:i + 1 + stack_size])

        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1)
        self.assertEqual(rb.get_current_episode_len(), episode_len1)
        for i in range(episode_len1):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Reset environment
        rb.on_episode_end()
        self.assertEqual(rb.get_current_episode_len(), 0)
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1)
        for i in range(episode_len1):
            with self.subTest(i=i):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])

        # Add 2nd episode
        for i in range(episode_len2):
            rb.add(obs=obs2[i:i + stack_size],
                   next_obs=obs2[i + 1:i + 1 + stack_size])

        self.assertEqual(rb.get_current_episode_len(), episode_len2)
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
        for i in range(episode_len1):
            with self.subTest(i=i, v="obs"):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            with self.subTest(i=i, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])
        for i in range(episode_len2):
            with self.subTest(i=i + episode_len1, v="obs"):
                np.testing.assert_equal(s["obs"][i + episode_len1],
                                        obs2[i:i + stack_size])
            with self.subTest(i=i + episode_len1, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                        obs2[i + 1:i + 1 + stack_size])

        rb.on_episode_end()
        self.assertEqual(rb.get_current_episode_len(), 0)
        s = rb.get_all_transitions()
        self.assertEqual(rb.get_stored_size(), episode_len1 + episode_len2)
        for i in range(episode_len1):
            with self.subTest(i=i, v="obs"):
                np.testing.assert_equal(s["obs"][i], obs[i:i + stack_size])
            with self.subTest(i=i, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i],
                                        obs[i + 1:i + 1 + stack_size])
        for i in range(episode_len2):
            with self.subTest(i=i + episode_len1, v="obs"):
                np.testing.assert_equal(s["obs"][i + episode_len1],
                                        obs2[i:i + stack_size])
            with self.subTest(i=i + episode_len1, v="next_obs"):
                np.testing.assert_equal(s["next_obs"][i + episode_len1],
                                        obs2[i + 1:i + 1 + stack_size])
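
For reference, a minimal standalone sketch of the next_of / stack_compress combination exercised by the test above (illustrative only; it mirrors the same cpprb API calls):

import numpy as np
from cpprb import ReplayBuffer

stack_size = 4
rb = ReplayBuffer(32,
                  {"obs": {"shape": (stack_size,), "dtype": np.int64}},
                  next_of="obs",
                  stack_compress="obs")

frames = np.arange(10, dtype=np.int64)
for i in range(5):
    # Consecutive stacks overlap by stack_size - 1 frames; stack_compress
    # stores the overlapping frames only once internally.
    rb.add(obs=frames[i:i + stack_size],
           next_obs=frames[i + 1:i + 1 + stack_size])
rb.on_episode_end()   # flush the cached frames before the next episode starts

t = rb.get_all_transitions()
print(t["obs"].shape)   # (5, 4): full stacks are reconstructed on read
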
Example n. 29
0
class RainbowAgent:
    """
    Rainbow Agent interacting with environment.
    
    Attributes:
        env (gym.Env): openAI Gym environment (connected to Gazebo node)
        memory (PrioritizedReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        target_update (int): period for target model's hard update
        gamma (float): discount factor
        dqn (Network): model to train and select actions
        dqn_target (Network): target model to update
        optimizer (torch.optim): optimizer for training dqn
        transition (list): transition information including 
            state, action, reward, next_state, done
        v_min (float): min value of support
        v_max (float): max value of support
        atom_size (int): the unit number of support
        support (torch.Tensor): support for categorical dqn
        use_n_step (bool): whether to use n_step memory
        n_step (int): step number to calculate n-step td error
        memory_n (ReplayBuffer): n-step replay buffer
    """
    def __init__(
        self,
        env: gym.Env,
        memory_size: int,
        batch_size: int,
        target_update: int,
        gamma: float = 0.99,
        # PER parameters
        alpha: float = 0.2,
        beta: float = 0.6,
        prior_eps: float = 1e-6,
        # Categorical DQN parameters
        v_min: float = 0.0,
        v_max: float = 200.0,
        atom_size: int = 51,
        # N-step Learning
        n_step: int = 3,
        # Convergence parameters
        convergence_window: int = 100,
        convergence_window_epsilon_p: int = 10, 
        convergence_avg_score: float = 195.0,
        convergence_avg_epsilon: float = 0.0524, # 3 degs converted to rads
        convergence_avg_epsilon_p: float = 0.0174, # 1 deg/s converted to rad/s
        # Tensorboard parameters
        model_name: str = "snake_joint",
    ):
        """
        Initialization.

        Args:
            env_client (GymEnvClient): ROS client to an openAI Gym environment server
            memory_size (int): length of memory
            batch_size (int): batch size for sampling
            target_update (int): period for target model's hard update
            lr (float): learning rate
            gamma (float): discount factor
            alpha (float): determines how much prioritization is used
            beta (float): determines how much importance sampling is used
            prior_eps (float): guarantees every transition can be sampled
            v_min (float): min value of support
            v_max (float): max value of support
            atom_size (int): the unit number of support
            n_step (int): step number to calculate n-step td error
        """
        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.n

        self.env = env
        self.batch_size = batch_size
        self.target_update = target_update
        self.gamma = gamma

        # Selecting computing device
        physical_devices = tf.config.list_physical_devices('GPU') 
        n_gpu = len(physical_devices)
        rospy.loginfo("Number of GPU detected : " + str(n_gpu))
        if n_gpu > 0:
            rospy.loginfo("Switching to single GPU mode : /device:GPU:0")
            self.used_device = "/device:GPU:0"
            tf.config.experimental.set_memory_growth(physical_devices[0], True)
        else:
            rospy.loginfo("No GPU detected. Switching to single CPU mode : /device:CPU:0")
            self.used_device = "/device:CPU:0"

        # PER
        # memory for 1-step learning
        self.beta = beta
        self.prior_eps = prior_eps
        self.memory = PrioritizedReplayBuffer(
            memory_size,
            {
                "obs": {"shape": (obs_dim,)},
                "act": {"shape": (1,)},
                "rew": {},
                "next_obs": {"shape": (obs_dim,)},
                "done": {}
            },
            alpha=alpha    
        )

        # memory for N-step learning
        self.use_n_step = n_step > 1
        if self.use_n_step:
            self.n_step = n_step
            self.memory_n = ReplayBuffer(
                memory_size,
                {
                    "obs": {"shape": (obs_dim,)},
                    "act": {"shape": (1,)},
                    "rew": {},
                    "next_obs": {"shape": (obs_dim,)},
                    "done": {}
                },
                Nstep={
                    "size": n_step,
                    "gamma": gamma,
                    "rew": "rew",
                    "next": "next_obs"
                }
            )
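            # Note (based on cpprb's documented Nstep behaviour): with the
            # Nstep option, "rew" stores the discounted n-step return and
            # "next_obs" the observation n steps ahead. add() buffers the
            # first n-1 transitions internally and returns None until an
            # n-step transition is actually stored, which is why step()
            # below checks the returned index.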

        # Categorical DQN parameters
        self.v_min = v_min
        self.v_max = v_max
        self.atom_size = atom_size
        self.support = tf.linspace(self.v_min, self.v_max, self.atom_size, name="support")

        # networks: dqn, dqn_target
        self.dqn = Network(
            obs_dim, action_dim, self.atom_size, self.support, name="dqn"
        )
        self.dqn_target = Network(
            obs_dim, action_dim, self.atom_size, self.support, name="dqn_target"
        )

        # optimizer
        self.optimizer = Adam(
            learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, name='AdamOptimizer'
        )

        # transition to store in memory
        self.transition = list()

        # mode: train / test
        self.is_test = False

        # Custom tensorboard object
        self.tensorboard = RainbowTensorBoard(
            log_dir="single_joint_logs/{}-{}".format(
                model_name,
                datetime.now().strftime("%m-%d-%Y-%H_%M_%S")
            )
        )
        # Convergence criterion
        self.convergence_window = convergence_window
        self.convergence_window_epsilon_p = convergence_window_epsilon_p
        self.convergence_avg_score = convergence_avg_score 
        self.convergence_avg_epsilon = convergence_avg_epsilon
        self.convergence_avg_epsilon_p = convergence_avg_epsilon_p

        #TODO 
        # model checkpoint object
        self.checkpoint = tf.train.Checkpoint(optimizer=self.optimizer, model=self.dqn_target)
        self.checkpoint_manager = tf.train.CheckpointManager(
            self.checkpoint, directory="single_joint_ckpts", max_to_keep=5
        )


    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        # NoisyNet: no epsilon greedy action selection
        selected_action = tf.math.argmax(self.dqn(
            tf.constant(state.reshape(1, state.shape[0]), dtype=tf.float32)
        ), axis=-1, name="argmax_selected_action")
        
        # Convert to numpy ndarray datatype
        selected_action = selected_action.numpy()

        if not self.is_test:
            self.transition = [state, selected_action]
        
        return selected_action


    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """
        Take an action and return the response of the env.
        """
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, done]

            # N-step transition
            if self.use_n_step:
                idx = self.memory_n.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], self.transition)
                    )
                )
                one_step_transition = (
                    [v[idx] for _, v in self.memory_n.get_all_transitions().items()]
                    if idx is not None else None
                )

            # 1-step transition
            else:
                one_step_transition = self.transition
            # add a single step transition
            if one_step_transition:
                self.memory.add(
                    **dict(
                        zip(["obs", "act", "rew", "next_obs", "done"], one_step_transition)
                    )
                )
        return next_state, reward, done


    def update_model(self) -> tf.Tensor:
        """
        Update the model by gradient descent
        """
        # PER needs beta to calculate weights
        samples = self.memory.sample(self.batch_size, beta=self.beta)
        weights = tf.constant(
            samples["weights"].reshape(-1, 1),
            dtype=tf.float32,
            name="update_model_weights"
        )
        indices = samples["indexes"]

        # 1-step Learning loss
        elementwise_loss = self._compute_dqn_loss(samples, self.gamma)


        with tf.GradientTape() as tape:
            # PER: importance-sampling weights are applied before averaging
            loss = tf.math.reduce_mean(elementwise_loss * weights)

            # N-step Learning loss
            # We combine the 1-step loss and the n-step loss to reduce variance.
            if self.use_n_step:
                gamma = self.gamma ** self.n_step
                samples = {k: [v[i] for i in indices] for k,v in self.memory_n.get_all_transitions().items()}
                elementwise_loss_n_loss = self._compute_dqn_loss(samples, gamma)
                elementwise_loss += elementwise_loss_n_loss

                # PER: importance-sampling weights are applied before averaging
                loss = tf.math.reduce_mean(elementwise_loss * weights)
        
        dqn_variables = self.dqn.trainable_variables
        gradients = tape.gradient(loss, dqn_variables)
        gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
        self.optimizer.apply_gradients(zip(gradients, dqn_variables))

        # PER: update priorities
        loss_for_prior = elementwise_loss.numpy()
        new_priorities = loss_for_prior + self.prior_eps
        self.memory.update_priorities(indices, new_priorities)

        # NoisyNet: reset noise
        self.dqn.reset_noise()
        self.dqn_target.reset_noise()

        return loss.numpy().ravel()


    def train(self, num_frames: int):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        update_cnt = 0
        scores = deque(maxlen=self.convergence_window)
        joint_epsilon = deque(maxlen=self.convergence_window)
        joint_epsilon_p = deque(maxlen=self.convergence_window_epsilon_p)
        score = 0 # cumulated reward
        episode_length = 0
        episode_cnt = 0

        for frame_idx in tqdm(range(1, num_frames + 1), file=tqdm_out):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)
            state = next_state
            score += reward
            episode_length += 1

            # PER: increase beta
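            # beta is annealed linearly from its initial value toward 1.0,
            # so the importance-sampling correction becomes fully unbiased
            # by the end of training.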
            fraction = min(frame_idx / num_frames, 1.0)
            self.beta = self.beta + fraction * (1.0 - self.beta)

            print("epsilon_p is {}".format(state[7]))
            print("epsilon is {}".format(state[6]))

            if done:
                print("done")
                # to be used for convergence criterion
                scores.append(score) 
                joint_epsilon.append(state[6])
                joint_epsilon_p.append(state[7])
                #

                final_state = state  # keep the final observation before resetting
                state = self.env.reset()
                self.tensorboard.update_stats(
                    score={
                        "data": score,
                        "desc": "Score (or cumulated rewards) for an episode - episode index on x-axis."
                    },
                    episode_length={
                        "data": episode_length,
                        "desc": "Episode length (in frames)"
                    },
                    final_epsilon={
                        "data": final_state[6],
                        "desc": "Value of epsilon = abs(theta_ld - theta_l) at the last frame of an episode"
                    },
                    final_epsilon_p={
                        "data": final_state[7],
                        "desc": "Value of d(epsilon)/dt at the last frame of an episode"
                    }
                )
                score = 0
                episode_length = 0
                episode_cnt += 1

                # check convergence criterion
                converged = bool(
                    len(scores) == self.convergence_window and # be sure the score buffer is full
                    len(joint_epsilon) == self.convergence_window and # same for epsilon buffer
                    len(joint_epsilon_p) == self.convergence_window and # same for epsilon_p buffer
                    mean(scores) > self.convergence_avg_score and 
                    mean(joint_epsilon) < self.convergence_avg_epsilon and
                    mean(joint_epsilon_p) < self.convergence_avg_epsilon_p
                )
                if converged:
                    rospy.loginfo("Ran {} episodes. Solved after {} trials".format(episode_cnt, frame_idx))
                    return

            #  if training is ready
            if self.memory.get_stored_size() >= self.batch_size:
                loss = self.update_model()
                # plotting loss every frame
                self.tensorboard.update_stats(
                    loss={
                        "data": loss[0],
                        "desc": "Loss value."
                    }
                )
                update_cnt += 1
                # if hard update is needed
                if update_cnt % self.target_update == 0:
                    self._target_hard_update()
                    # checkpoint the target model
                    self.checkpoint_manager.save()


        self.env.close()


    def test(self) -> List[np.ndarray]:
        """Test the agent."""
        self.is_test = True
        
        state = self.env.reset()
        done = False
        score = 0
        
        frames = []
        while not done:
            frames.append(self.env.render(mode="rgb_array"))
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward
        
        rospy.loginfo("score: ", score)
        self.env.close()
        
        return frames


    def _compute_dqn_loss(self, samples: Dict[str, np.ndarray], gamma: float) -> tf.Tensor:
        with tf.device(self.used_device):
            state = tf.constant(samples["obs"], dtype=tf.float32)
            next_state = tf.constant(samples["next_obs"], dtype=tf.float32)
            action = tf.constant(samples["act"], dtype=tf.float32)
            reward = tf.reshape(tf.constant(samples["rew"], dtype=tf.float32), [-1, 1])
            done = tf.reshape(tf.constant(samples["done"], dtype=tf.float32), [-1, 1])

            # Categorical DQN algorithm
            delta_z = float(self.v_max - self.v_min) / (self.atom_size - 1)

            # Double DQN
            next_action = tf.math.argmax(self.dqn(next_state), axis=1)
            next_dist = self.dqn_target.dist(next_state)
            next_dist = tf.gather_nd(
                next_dist,
                [[i, next_action.numpy()[i]] for i in range(self.batch_size)]
            )
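
            # Projection step of categorical (C51) DQN, for reference:
            #   Tz_j = clip(r + (1 - done) * gamma * z_j, v_min, v_max)
            #   b_j  = (Tz_j - v_min) / delta_z,  l_j = floor(b_j),  u_j = ceil(b_j)
            # Each target atom's probability mass is split between the two
            # neighbouring support atoms l_j and u_j in proportion to
            # (u_j - b_j) and (b_j - l_j); the tensor_scatter_nd_add calls
            # below implement exactly this scatter.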

            t_z = reward + (1 - done) * gamma * self.support
            t_z = tf.clip_by_value(t_z, clip_value_min=self.v_min, clip_value_max=self.v_max)
            b = tf.dtypes.cast((t_z - self.v_min) / delta_z, tf.float64)
            l = tf.dtypes.cast(tf.math.floor(b), tf.float64)
            u = tf.dtypes.cast(tf.math.ceil(b), tf.float64)

            offset = (
                tf.broadcast_to(
                    tf.expand_dims(
                        tf.dtypes.cast(
                            tf.linspace(0, (self.batch_size - 1) * self.atom_size, self.batch_size),
                            tf.float64
                        ),
                        axis=1
                    ),
                    [self.batch_size, self.atom_size]
                )
            )

            proj_dist = tf.zeros(tf.shape(next_dist), tf.float64)
            # casting
            next_dist = tf.dtypes.cast(next_dist, tf.float64)

            proj_dist = tf.tensor_scatter_nd_add(
                tf.reshape(proj_dist, [-1]), # input tensor
                tf.reshape(tf.dtypes.cast(l + offset, tf.int64), [-1, 1]), # indices
                tf.reshape((next_dist * (u - b)), [-1]) # updates
            )

            proj_dist = tf.tensor_scatter_nd_add(
                proj_dist,
                tf.reshape(tf.dtypes.cast(u + offset, tf.int64), [-1, 1]), # indices
                tf.reshape((next_dist * (b - l)), [-1]) # updates
            )
            proj_dist = tf.reshape(proj_dist, [self.batch_size, self.atom_size])

        dist = self.dqn.dist(state)
        #log_p = tf.math.log(dist[range(self.batch_size), action])
        log_p = tf.dtypes.cast(
            tf.math.log(
                tf.gather_nd(
                    dist,
                    [[i, tf.dtypes.cast(tf.reshape(action, [-1]), tf.int32).numpy()[i]] for i in range(self.batch_size)]
                )
            ),
            tf.float64
        )
        elementwise_loss = tf.math.reduce_sum(-(proj_dist * log_p), axis=1)

        return tf.dtypes.cast(elementwise_loss, tf.float32)


    def _target_hard_update(self):
        """Hard update: target <- local."""
        tf.saved_model.save(self.dqn, "single_joint_dqn")
        self.dqn_target = tf.saved_model.load("single_joint_dqn")
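

For reference, a minimal NumPy sketch of the categorical projection performed inside _compute_dqn_loss, written for a single transition (illustrative only, not part of the agent above; the v_min/v_max/atom_size defaults simply mirror the constructor defaults):

import numpy as np

def project_distribution(next_dist, reward, done, gamma,
                         v_min=0.0, v_max=200.0, atom_size=51):
    """Project r + gamma * (1 - done) * z onto the fixed support (one transition)."""
    support = np.linspace(v_min, v_max, atom_size)
    delta_z = (v_max - v_min) / (atom_size - 1)

    t_z = np.clip(reward + (1.0 - done) * gamma * support, v_min, v_max)
    b = (t_z - v_min) / delta_z
    l = np.floor(b).astype(np.int64)
    u = np.ceil(b).astype(np.int64)

    proj = np.zeros(atom_size)
    # Split each atom's probability mass between its two neighbouring atoms.
    np.add.at(proj, l, next_dist * (u - b))
    np.add.at(proj, u, next_dist * (b - l))
    # When b lands exactly on an atom (l == u), both weights above are zero,
    # so re-add that mass explicitly.
    exact = (l == u)
    np.add.at(proj, l[exact], next_dist[exact])
    return proj

# A terminal transition (done=1) collapses all the mass onto the atom nearest the reward:
print(project_distribution(np.full(51, 1.0 / 51), reward=100.0, done=1.0, gamma=0.99))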