Code example #1
File: features.py Project: ymd-h/cpprb
    def test_memmap(self):
        rb = ReplayBuffer(32, {"done": {}}, mmap_prefix="mmap")

        for _ in range(1000):
            rb.add(done=0.0)

        self.assertTrue(os.path.exists("mmap_done.dat"))
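
For context, a minimal sketch (not from the project) of the same memory-mapped setup with several keys; the `<prefix>_<key>.dat` naming is assumed from the assertion above, and the buffer size and key names below are only illustrative.

import os

import numpy as np
from cpprb import ReplayBuffer

# Sketch: with mmap_prefix, each key appears to be backed by "<prefix>_<key>.dat".
rb = ReplayBuffer(1024, {"obs": {"shape": (4,)}, "rew": {}, "done": {}},
                  mmap_prefix="demo")
rb.add(obs=np.zeros(4), rew=0.0, done=0.0)

for key in ("obs", "rew", "done"):
    print(key, os.path.exists(f"demo_{key}.dat"))
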
Code example #2
    def test(self):
        buffer_size = 256
        obs_dim = 3
        act_dim = 1
        rb = ReplayBuffer(
            buffer_size, {
                "obs": {
                    "shape": obs_dim
                },
                "act": {
                    "shape": act_dim
                },
                "rew": {},
                "next_obs": {
                    "shape": obs_dim
                },
                "done": {}
            })

        obs = np.ones(shape=(obs_dim))
        act = np.ones(shape=(act_dim))
        rew = 0
        next_obs = np.ones(shape=(obs_dim))
        done = 0

        for i in range(500):
            rb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

        batch_size = 32
        sample = rb.sample(batch_size)
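
A side note on the batch returned above (hedged): `sample` gives back a dict of NumPy arrays keyed like `env_dict`, with the batch size as the leading dimension; scalar keys such as `rew` and `done` are assumed to come back as `(batch_size, 1)`.

# Sketch continuing the test above (not part of the original).
for key, value in sample.items():
    print(key, value.shape)
# Expected (assumed) shapes:
#   obs      -> (32, 3)
#   act      -> (32, 1)
#   rew      -> (32, 1)
#   next_obs -> (32, 3)
#   done     -> (32, 1)
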
Code example #3
def explorer(global_rb,env_dict,is_training_done,queue):
    local_buffer_size = int(1e+2)
    local_rb = ReplayBuffer(local_buffer_size,env_dict)

    model = MyModel()
    env = gym.make("CartPole-v1")

    obs = env.reset()
    while not is_training_done.is_set():
        if not queue.empty():
            w = queue.get()
            model.weights = w

        action = model.get_action(obs)
        next_obs, reward, done, _ = env.step(action)
        local_rb.add(obs=obs,act=action,rew=reward,next_obs=next_obs,done=done)

        if done:
            local_rb.on_episode_end()
            obs = env.reset()
        else:
            obs = next_obs

        if local_rb.get_stored_size() == local_buffer_size:
            local_sample = local_rb.get_all_transitions()
            local_rb.clear()

            absTD = model.abs_TD_error(local_sample)
            global_rb.add(**local_sample,priorities=absTD)
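
The explorer above implies a learner process on the other end of `global_rb`. Below is a hedged sketch of such a counterpart, assuming `global_rb` is a cpprb `PrioritizedReplayBuffer` shared between processes (so its `sample` returns "weights" and "indexes" and `update_priorities` is available) and that `MyModel` exposes the same hypothetical interface used above.

def learner(global_rb, batch_size, n_iterations, is_training_done, queues):
    # Hypothetical counterpart to the explorer above (sketch only).
    model = MyModel()
    for _ in range(n_iterations):
        if global_rb.get_stored_size() < batch_size:
            continue
        sample = global_rb.sample(batch_size)
        model.train(sample)                      # hypothetical training step
        absTD = model.abs_TD_error(sample)       # recompute |TD| on the sampled batch
        global_rb.update_priorities(sample["indexes"], absTD)
        for q in queues:
            q.put(model.weights)                 # ship updated weights back to explorers
    is_training_done.set()
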
Code example #4
    def test_train(self):
        agent = DQN(
            state_shape=self.env.observation_space.shape,
            action_dim=self.env.action_space.n,
            memory_capacity=100,
            gpu=-1)
        from cpprb import ReplayBuffer
        replay_buffer = ReplayBuffer(
            obs_dim=self.env.observation_space.shape,
            act_dim=1,
            size=agent.memory_capacity)

        obs = self.env.reset()
        for _ in range(100):
            action = agent.get_action(obs)
            next_obs, reward, done, _ = self.env.step(action)
            replay_buffer.add(obs=obs, act=action, next_obs=next_obs, rew=reward, done=done)
            if done:
                next_obs = self.env.reset()
            obs = next_obs

        for _ in range(100):
            samples = replay_buffer.sample(agent.batch_size)
            agent.train(samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"], np.array(samples["done"], dtype=np.float64))
Code example #5
class buffer_class:
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)

        #override the observation length in the replay memory
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17, )}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17, )}
        print('!!!!', env_dict['obs'])
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        batch = self.storage.sample(batch_size)
        s_matrix = batch['obs']
        a_matrix = batch['act']
        r_matrix = batch['rew']
        done_matrix = batch['done']
        sp_matrix = batch['next_obs']
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix

    def __len__(self):
        return self.storage.get_stored_size()
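
A hedged usage sketch of `buffer_class`: it assumes an old-style gym environment with the 4-tuple `step` API used elsewhere on this page; the environment name is only illustrative (the observation length is overridden to 17 in `__init__` regardless).

import gym

env = gym.make("HalfCheetah-v2")   # illustrative choice only
memory = buffer_class(max_length=100000, seed_number=0, env=env)

s = env.reset()
a = env.action_space.sample()
sp, r, done, _ = env.step(a)
memory.append(s, a, r, done, sp)

if len(memory) >= 32:
    s_b, a_b, r_b, d_b, sp_b = memory.sample(32)
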
Code example #6
class Agent:
    def __init__(self, learn_rate, state_shape, num_actions, batch_size):
        self.mem_size=100000
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(
            self.mem_size, 
            {   "obs":      { "shape": state_shape  },
                "act":      { "shape": 1            },
                "rew":      {                       },
                "next_obs": { "shape": state_shape  },
                "done":     { "shape": 1            }})

        self.net = Network(learn_rate, state_shape, num_actions)

    def choose_action(self, observation):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.net.device)
        state = state.unsqueeze(0)

        q_values = self.net(state)
        action = torch.argmax(q_values).item()
        return action

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state, act=action, rew=reward, next_obs=next_state, done=done)  

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return
    
        batch = self.memory.sample(self.batch_size)
            
        states  = torch.tensor( batch["obs"]                     ).to(self.net.device)
        actions = torch.tensor( batch["act"],   dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor( batch["rew"]                     ).to(self.net.device).T[0]
        states_ = torch.tensor( batch["next_obs"]                ).to(self.net.device)
        dones   = torch.tensor( batch["done"],  dtype=torch.bool ).to(self.net.device).T[0]

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values  =   self.net(states)[batch_index, actions]
        q_values_ =   self.net(states_)

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = (td ** 2.0).mean()
        loss.backward()
        self.net.optimizer.step()

        self.net.reset_noise()
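
A hedged sketch of driving the `Agent` above, assuming a discrete-action gym environment with the old 4-tuple `step` API; the environment name and hyperparameters are placeholders, and `Network`/`Lerper` are the classes this Agent already depends on.

import gym

env = gym.make("CartPole-v1")
agent = Agent(learn_rate=1e-3,
              state_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              batch_size=64)

obs = env.reset()
for step in range(10_000):
    action = agent.choose_action(obs)
    next_obs, reward, done, _ = env.step(action)
    agent.store_memory(obs, action, reward, next_obs, done)
    agent.learn()
    obs = env.reset() if done else next_obs
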
Code example #7
    def __init__(self, *args, n_eval_episodes_per_model=5, **kwargs):
        kwargs["n_dynamics_model"] = 5
        super().__init__(*args, **kwargs)
        self._n_eval_episodes_per_model = n_eval_episodes_per_model

        # Replay buffer to train policy
        self.replay_buffer = get_replay_buffer(self._policy, self._env)

        # Replay buffer to compute GAE
        rb_dict = {
            "size": self._episode_max_steps,
            "default_dtype": np.float32,
            "env_dict": {
                "obs": {
                    "shape": self._env.observation_space.shape
                },
                "act": {
                    "shape": self._env.action_space.shape
                },
                "next_obs": {
                    "shape": self._env.observation_space.shape
                },
                "rew": {},
                "done": {},
                "logp": {},
                "val": {}
            }
        }
        self.local_buffer = ReplayBuffer(**rb_dict)
Code example #8
    def test_with_one(self):
        buffer_size = 32
        obs_shape = 3
        act_shape = 4

        rb = ReplayBuffer(buffer_size, {
            "obs": {
                "shape": obs_shape
            },
            "act": {
                "shape": act_shape
            },
            "done": {}
        })

        v = {
            "obs": np.ones(shape=obs_shape),
            "act": np.zeros(shape=act_shape),
            "done": 0
        }

        rb.add(**v)

        tx = rb.get_all_transitions()

        for key in ["obs", "act", "done"]:
            with self.subTest(key=key):
                np.testing.assert_allclose(tx[key],
                                           np.asarray(v[key]).reshape((1, -1)))
Code example #9
    def __call__(self):
        total_steps = 0
        n_episode = 0

        # TODO: clean codes
        # Prepare buffer
        self.replay_buffer = get_replay_buffer(self._policy, self._env)
        kwargs_local_buf = get_default_rb_dict(size=self._episode_max_steps,
                                               env=self._env)
        kwargs_local_buf["env_dict"]["logp"] = {}
        kwargs_local_buf["env_dict"]["val"] = {}
        if is_discrete(self._env.action_space):
            kwargs_local_buf["env_dict"]["act"]["dtype"] = np.int32
        self.local_buffer = ReplayBuffer(**kwargs_local_buf)

        tf.summary.experimental.set_step(total_steps)
        while total_steps < self._max_steps:
            # Collect samples
            n_episode, total_rewards = self._collect_sample(
                n_episode, total_steps)
            total_steps += self._policy.horizon
            tf.summary.experimental.set_step(total_steps)

            if len(total_rewards) > 0:
                avg_training_return = sum(total_rewards) / len(total_rewards)
                tf.summary.scalar(name="Common/training_return",
                                  data=avg_training_return)

            # Train actor critic
            for _ in range(self._policy.n_epoch):
                samples = self.replay_buffer.sample(self._policy.horizon)
                if self._policy.normalize_adv:
                    adv = (samples["adv"] - np.mean(samples["adv"])) / np.std(
                        samples["adv"])
                else:
                    adv = samples["adv"]
                for idx in range(
                        int(self._policy.horizon / self._policy.batch_size)):
                    target = slice(idx * self._policy.batch_size,
                                   (idx + 1) * self._policy.batch_size)
                    self._policy.train(states=samples["obs"][target],
                                       actions=samples["act"][target],
                                       advantages=adv[target],
                                       logp_olds=samples["logp"][target],
                                       returns=samples["ret"][target])

            if total_steps % self._test_interval == 0:
                avg_test_return = self.evaluate_policy(total_steps)
                self.logger.info(
                    "Evaluation Total Steps: {0: 7} Average Reward {1: 5.4f} over {2: 2} episodes"
                    .format(total_steps, avg_test_return, self._test_episodes))
                tf.summary.scalar(name="Common/average_test_return",
                                  data=avg_test_return)
                self.writer.flush()

            if total_steps % self._model_save_interval == 0:
                self.checkpoint_manager.save()

        tf.summary.flush()
Code example #10
    def test_nstep(self):
        rb = ReplayBuffer(32,{'rew': {}, 'done': {}},
                          Nstep={"size": 4, "rew": "rew"})

        self.assertIs(rb.add(rew=1,done=0),None)
        self.assertIs(rb.add(rew=1,done=0),None)
        self.assertIs(rb.add(rew=1,done=0),None)
        self.assertEqual(rb.add(rew=1,done=0),0)
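
For reference, a hedged sketch of a fuller N-step configuration than the test above; it assumes the `gamma` and `next` fields of the `Nstep` dict accumulate discounted rewards over the window and pull `next_obs` from the end of it, and that `on_episode_end()` flushes a partially filled window.

from cpprb import ReplayBuffer

rb = ReplayBuffer(32,
                  {"obs": {"shape": 1}, "rew": {}, "next_obs": {"shape": 1}, "done": {}},
                  Nstep={"size": 4, "gamma": 0.99, "rew": "rew", "next": "next_obs"})

for i in range(8):
    # Returns None while the 4-step window is still filling, then the write index.
    idx = rb.add(obs=[i], rew=1.0, next_obs=[i + 1], done=0.0)
    print(i, idx)

rb.on_episode_end()   # assumed to flush the remaining partial transitions
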
Code example #11
    def test_multistep_add(self):
        rb = ReplayBuffer(4, {"done": {}})

        done = jnp.asarray([1,1,1])

        for i in range(2):
            with self.subTest(i=i):
                rb.add(done=done)
Code example #12
    def test_add(self):
        rb = ReplayBuffer(4, {"done": {}})

        done = jnp.asarray(1)

        for i in range(5):
            with self.subTest(i=i):
                rb.add(done=done)
Code example #13
File: issue.py Project: ymd-h/cpprb
    def test_python_type(self):
        types = [bool, int, float]

        for d in types:
            with self.subTest(type=d):
                b = ReplayBuffer(10, {"a": {"dtype": d}})
                b.add(a=d(1))
                self.assertEqual(b.get_all_transitions()["a"].dtype, d)
Code example #14
    def set_replay_buffer(self, env, get_from_file):

        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape

        if get_from_file:
            print(colorize("Pulling saved expert %s trajectories from file over %d episodes" %
                           (self.config_name, self.expert_episodes), 'blue', bold=True))

            f = open(self._demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb")
            buffer_file = pickle.load(f)
            f.close()

            data = samples_from_cpprb(npsamples=buffer_file)

            # Reconstruct the data, then pass it to replay buffer
            np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

            # Create environment
            before_add = create_before_add_func(env)

            replay_buffer = ReplayBuffer(size= self.replay_buffer_size,
                                         env_dict={
                                             "obs": {"shape": obs_dim},
                                             "act": {"shape": act_dim},
                                             "rew": {},
                                             "next_obs": {"shape": obs_dim},
                                             "done": {}})

            replay_buffer.add(**before_add(obs=np_states[~np_dones],
                                           act=np_actions[~np_dones],
                                           rew=np_rewards[~np_dones],
                                           next_obs=np_next_states[~np_dones],
                                           done=np_next_dones[~np_dones]))
            self.replay_buffer = replay_buffer

        else:
            # Generate expert data
            print(colorize(
                "Generating expert %s trajectories from file over %d episodes" % (self.config_name, self.expert_episodes),
                'blue', bold=True))

            # Load trained policy
            _, get_action = load_policy_and_env(osp.join(self._root_data_path, self.file_name, self.file_name + '_s0/'),
                                                'last', False)
            expert_rb = run_policy(env,
                                   get_action,
                                   0,
                                   self.expert_episodes,
                                   False,
                                   record=not get_from_file,
                                   record_name='expert_' + self.file_name + '_' + str(self.expert_episodes) + '_runs',
                                   record_project='clone_benchmarking_' + self.config_name,
                                   data_path= self._expert_path,
                                   config_name= self.config_name,
                                   max_len_rb=self.replay_buffer_size)

            self.replay_buffer = expert_rb
Code example #15
File: get_replay_buffer.py Project: zhangtning/tf2rl
def get_replay_buffer(policy,
                      env,
                      use_prioritized_rb=False,
                      use_nstep_rb=False,
                      n_step=1,
                      size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs["size"] = size

    # on-policy policy
    if not issubclass(type(policy), OffPolicyAgent):
        kwargs["size"] = policy.horizon
        kwargs["env_dict"].pop("next_obs")
        kwargs["env_dict"].pop("rew")
        # TODO: Remove done. Currently cannot remove because of cpprb implementation
        # kwargs["env_dict"].pop("done")
        kwargs["env_dict"]["logp"] = {}
        kwargs["env_dict"]["ret"] = {}
        kwargs["env_dict"]["adv"] = {}
        if is_discrete(env.action_space):
            kwargs["env_dict"]["act"]["dtype"] = np.int32
        return ReplayBuffer(**kwargs)

    # N-step prioritized
    if use_prioritized_rb and use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }
        return PrioritizedReplayBuffer(**kwargs)

    if len(obs_shape) == 3:
        kwargs["env_dict"]["obs"]["dtype"] = np.ubyte
        kwargs["env_dict"]["next_obs"]["dtype"] = np.ubyte

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["Nstep"] = {
            "size": n_step,
            "gamma": policy.discount,
            "rew": "rew",
            "next": "next_obs"
        }
        return ReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
Code example #16
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)

        #override the observation length in the replay memory
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17, )}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17, )}
        print('!!!!', env_dict['obs'])
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)
Code example #17
    def test_nstep_multistep_add(self):
        rb = ReplayBuffer(6, {"obs": {}, "rew": {}, "done": {}, "next_obs":{}},
                          Nstep={"size": 4, "rew": "rew", "next": "next_obs"})

        obs = jnp.asarray([1,1,1,1])
        rew = jnp.asarray([1,1,1,1])
        done = jnp.asarray([1,1,1,1])
        next_obs = jnp.asarray([1,1,1,1])

        for i in range(7):
            with self.subTest(i=i):
                rb.add(obs=obs, rew=rew, done=done, next_obs=next_obs)
Code example #18
    def set_multiple_replay_buffers(self, env):
        print(self.config_name_list)

        obs_dim = env.observation_space.shape
        act_dim = env.action_space.shape

        print(colorize("Pulling saved trajectories from two experts ( %s and %s) from files over %d episodes" %
                       (self.config_name_list[0], self.config_name_list[1], self.expert_episodes), 'blue', bold=True))

        rb_list = []

        v = 0
        for x in self.config_name_list:

            _expert_demo_dir = os.path.join(self._expert_path, x + '_episodes/')

            f = open(_expert_demo_dir + 'sim_data_' + str(self.expert_episodes) + '_buffer.pkl', "rb")
            buffer_file = pickle.load(f)
            f.close()

            data = samples_from_cpprb(npsamples=buffer_file)

            # Reconstruct the data, then pass it to replay buffer
            np_states, np_rewards, np_actions, np_next_states, np_dones, np_next_dones = samples_to_np(data)

            # Create environment
            before_add = create_before_add_func(env)

            replay_buffer = ReplayBuffer(size=self.replay_buffer_size,
                                         env_dict={
                                             "obs": {"shape": tuple([obs_dim[0]+2,])},
                                             "act": {"shape": act_dim},
                                             "rew": {},
                                             "next_obs": {"shape": tuple([obs_dim[0]+2,])},
                                             "done": {}})



            # Concatenate the states with one hot vectors depending on class
            extend1 = [one_hot(np.array([v]), self.n_experts)] * np_states[~np_dones].shape[0]

            appended_states = np.append(np_states[~np_dones], np.c_[extend1], 1)
            appended_next_states = np.append(np_next_states[~np_dones], np.c_[extend1], 1)

            replay_buffer.add(**before_add(obs=appended_states,
                                           act=np_actions[~np_dones],
                                           rew=np_rewards[~np_dones],
                                           next_obs=appended_next_states,
                                           done=np_next_dones[~np_dones]))

            rb_list.append(replay_buffer)
            v += 1
        self.rb_list = rb_list
Code example #19
def get_replay_buffer(policy, env, size=None):
    if policy is None or env is None:
        return None

    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    return ReplayBuffer(**kwargs)
Code example #20
File: get_replay_buffer.py Project: keiohta/torchrl
def get_replay_buffer(policy,
                      env,
                      use_prioritized_rb=False,
                      use_nstep_rb=False,
                      n_step=1,
                      size=None):
    if policy is None or env is None:
        return None

    obs_shape = get_space_size(env.observation_space)
    kwargs = get_default_rb_dict(policy.memory_capacity, env)

    if size is not None:
        kwargs['size'] = size

    # TODO(sff1019): Add on-policy behaviour
    # TODO(sff1019): Add N-step prioritized

    if len(obs_shape) == 3:
        kwargs['env_dict']['obs']['dtype'] = np.ubyte
        kwargs['env_dict']['next_obs']['dtype'] = np.ubyte

    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    return ReplayBuffer(**kwargs)
Code example #21
    def test_buffer(self):

        buffer_size = 256
        obs_shape = (15,15)
        act_dim = 5

        N = 512

        erb = ReplayBuffer(buffer_size,{"obs":{"shape": obs_shape},
                                        "act":{"shape": act_dim},
                                        "rew":{},
                                        "next_obs":{"shape": obs_shape},
                                        "done":{}})

        for i in range(N):
            obs = np.full(obs_shape,i,dtype=np.double)
            act = np.full(act_dim,i,dtype=np.double)
            rew = i
            next_obs = obs + 1
            done = 0

            erb.add(obs=obs,act=act,rew=rew,next_obs=next_obs,done=done)

        erb._encode_sample(range(buffer_size))

        erb.sample(32)

        erb.clear()

        self.assertEqual(erb.get_next_index(),0)
        self.assertEqual(erb.get_stored_size(),0)
Code example #22
    def test_update_count(self):
        """
        Check step and episode

        step < max_steps
        episode <= step
        """
        rb = ReplayBuffer(
            32, {
                "obs": {
                    "shape": (3, )
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (3, )
                },
                "done": {}
            })

        def update(kw, step, episode):
            self.assertLess(step, 10)
            self.assertLessEqual(episode, step)
            return 0.5

        train(rb,
              self.env,
              lambda obs, step, episode, is_warmup: 1.0,
              update,
              max_steps=10)
Code example #23
    def test_too_big_max_steps(self):
        """
        Raise ValueError for too big max_steps
        """
        rb = ReplayBuffer(
            32, {
                "obs": {
                    "shape": (3, )
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (3, )
                },
                "done": {}
            })

        def update(kw, step, episode):
            raise RuntimeError

        with self.assertRaises(ValueError):
            train(rb,
                  self.env,
                  lambda obs, step, episode, is_warmup: 1.0,
                  update,
                  max_steps=int(1e+32))
Code example #24
    def test_episode_callback(self):
        """
        Pass custom episode_callback
        """
        rb = ReplayBuffer(
            32, {
                "obs": {
                    "shape": (3, )
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (3, )
                },
                "done": {}
            })

        def callback(episode, episode_step, episode_reward):
            self.assertEqual(episode_step, int(episode_reward))

        train(rb,
              self.env,
              lambda obs, step, episode, is_warmup: 1.0,
              lambda tr, step, episode: 0.5,
              max_steps=10,
              rew_sum=lambda sum, tr: sum + 1.0,
              done_check=lambda tr: True)
Code example #25
    def test_done_check(self):
        """
        Pass custom check_done which always return `True`

        Always step == episode
        """
        rb = ReplayBuffer(
            32, {
                "obs": {
                    "shape": (3, )
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (3, )
                },
                "done": {}
            })

        def update(kw, step, episode):
            self.assertLess(step, 10)
            self.assertEqual(step, episode)
            return 0.5

        train(rb,
              self.env,
              lambda obs, step, episode, is_warmup: 1.0,
              update,
              max_steps=10,
              done_check=lambda kw: True)
Code example #26
    def test_warmup(self):
        """
        Skip warmup steps

        n_warmups <= step
        """
        rb = ReplayBuffer(
            32, {
                "obs": {
                    "shape": (3, )
                },
                "act": {},
                "rew": {},
                "next_obs": {
                    "shape": (3, )
                },
                "done": {}
            })

        def update(kw, step, episode):
            self.assertGreaterEqual(step, 5)
            self.assertLess(step, 10)
            self.assertLessEqual(episode, step)
            return 0.5

        train(rb,
              self.env,
              lambda obs, step, episode, is_warmup: 1.0,
              update,
              max_steps=10,
              n_warmups=5)
Code example #27
File: issue.py Project: ymd-h/cpprb
    def test_dtype_check(self):
        types = [
            np.bool_, np.bool8, np.byte, np.short, np.intc, np.int_,
            np.longlong, np.intp, np.int8, np.int16, np.int32, np.int64,
            np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong, np.uintp,
            np.uint8, np.uint16, np.uint32, np.uint64, np.half, np.single,
            np.double, np.float_, np.longfloat, np.float16, np.float32,
            np.float64, np.csingle, np.complex_, np.clongfloat, np.complex64,
            np.complex128
        ]

        for d in types:
            with self.subTest(type=d):
                b = ReplayBuffer(10, {"a": {"dtype": d}})
                b.add(a=np.ones(1, dtype=d))
                self.assertEqual(b.get_all_transitions()["a"].dtype, d)
Code example #28
    def __init__(self, learn_rate, state_shape, num_actions, batch_size):
        self.mem_size=100000
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(
            self.mem_size, 
            {   "obs":      { "shape": state_shape  },
                "act":      { "shape": 1            },
                "rew":      {                       },
                "next_obs": { "shape": state_shape  },
                "done":     { "shape": 1            }})

        self.net = Network(learn_rate, state_shape, num_actions)
Code example #29
    def test_ReplayBuffer_with_single_step(self):
        buffer_size = 256
        obs_shape = (3, 4)
        batch_size = 10

        rb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape}})

        v = {"obs": np.ones(shape=obs_shape)}

        rb.add(**v)

        rb.sample(batch_size)

        for _ in range(100):
            rb.add(**v)

        rb.sample(batch_size)
Code example #30
    def __init__(self,
                 lr,
                 state_shape,
                 num_actions,
                 batch_size,
                 max_mem_size=1000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size
        self.target_update_interval = 200
        self.step_count = 0

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(
            max_mem_size, {
                "obs": {
                    "shape": state_shape
                },
                "act": {
                    "shape": 1
                },
                "rew": {},
                "next_obs": {
                    "shape": state_shape
                },
                "done": {
                    "shape": 1
                }
            })

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        # self.device = torch.device("cpu")

        self.V_MIN, self.V_MAX = 0, 200
        self.NUM_ATOMS = 4
        self.support = torch.linspace(self.V_MIN, self.V_MAX,
                                      self.NUM_ATOMS).to(self.device)
        self.net = Network(lr, state_shape, num_actions, self.support,
                           self.NUM_ATOMS).to(self.device)
        self.net_ = Network(lr, state_shape, num_actions, self.support,
                            self.NUM_ATOMS).to(self.device)

        self.net_.load_state_dict(self.net.state_dict())