Example #1
    def test_buffer(self):

        buffer_size = 256
        obs_shape = (15,15)
        act_dim = 5

        N = 512

        erb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape},
                                         "act": {"shape": act_dim},
                                         "rew": {},
                                         "next_obs": {"shape": obs_shape},
                                         "done": {}})

        for i in range(N):
            obs = np.full(obs_shape, i, dtype=np.double)
            act = np.full(act_dim, i, dtype=np.double)
            rew = i
            next_obs = obs + 1
            done = 0

            erb.add(obs=obs,act=act,rew=rew,next_obs=next_obs,done=done)

        es = erb._encode_sample(range(buffer_size))

        erb.sample(32)

        erb.clear()

        self.assertEqual(erb.get_next_index(),0)
        self.assertEqual(erb.get_stored_size(),0)
Example #2
    def test(self):
        buffer_size = 256
        obs_dim = 3
        act_dim = 1
        rb = ReplayBuffer(
            buffer_size, {
                "obs": {
                    "shape": obs_dim
                },
                "act": {
                    "shape": act_dim
                },
                "rew": {},
                "next_obs": {
                    "shape": obs_dim
                },
                "done": {}
            })

        obs = np.ones(shape=(obs_dim))
        act = np.ones(shape=(act_dim))
        rew = 0
        next_obs = np.ones(shape=(obs_dim))
        done = 0

        for i in range(500):
            rb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

        batch_size = 32
        sample = rb.sample(batch_size)
Example #3
class Agent:
    def __init__(self, learn_rate, state_shape, num_actions, batch_size):
        self.mem_size=100000
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(
            self.mem_size, 
            {   "obs":      { "shape": state_shape  },
                "act":      { "shape": 1            },
                "rew":      {                       },
                "next_obs": { "shape": state_shape  },
                "done":     { "shape": 1            }})

        self.net = Network(learn_rate, state_shape, num_actions)

    def choose_action(self, observation):
        state = torch.tensor(observation).float().detach()
        state = state.to(self.net.device)
        state = state.unsqueeze(0)

        q_values = self.net(state)
        action = torch.argmax(q_values).item()
        return action

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state, act=action, rew=reward, next_obs=next_state, done=done)  

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return
    
        batch = self.memory.sample(self.batch_size)
            
        states  = torch.tensor( batch["obs"]                     ).to(self.net.device)
        actions = torch.tensor( batch["act"],   dtype=torch.int64).to(self.net.device).T[0]
        rewards = torch.tensor( batch["rew"]                     ).to(self.net.device).T[0]
        states_ = torch.tensor( batch["next_obs"]                ).to(self.net.device)
        dones   = torch.tensor( batch["done"],  dtype=torch.bool ).to(self.net.device).T[0]

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        q_values  = self.net(states)[batch_index, actions]
        # Detach the next-state values so gradients do not flow into the TD target
        q_values_ = self.net(states_).detach()

        action_qs_ = torch.max(q_values_, dim=1)[0]
        action_qs_[dones] = 0.0
        q_target = rewards + self.gamma * action_qs_

        td = q_target - q_values

        self.net.optimizer.zero_grad()
        loss = (td ** 2.0).mean()
        loss.backward()
        self.net.optimizer.step()

        self.net.reset_noise()
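A minimal driver loop for the Agent above, as a sketch: the environment name, the hyper-parameters, and the classic Gym reset/step API are assumptions, and Lerper and Network come from elsewhere in this example's project.

import gym

env = gym.make("CartPole-v1")  # placeholder environment
agent = Agent(learn_rate=1e-3,
              state_shape=env.observation_space.shape,
              num_actions=env.action_space.n,
              batch_size=64)

for episode in range(10):
    state = env.reset()
    done = False
    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.store_memory(state, action, reward, next_state, done)
        agent.learn()  # no-op until batch_size transitions are stored
        state = next_state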
Example #4
    def test_train(self):
        agent = DQN(
            state_shape=self.env.observation_space.shape,
            action_dim=self.env.action_space.n,
            memory_capacity=100,
            gpu=-1)
        from cpprb import ReplayBuffer
        replay_buffer = ReplayBuffer(
            obs_dim=self.env.observation_space.shape,
            act_dim=1,
            size=agent.memory_capacity)

        obs = self.env.reset()
        for _ in range(100):
            action = agent.get_action(obs)
            next_obs, reward, done, _ = self.env.step(action)
            replay_buffer.add(obs=obs, act=action, next_obs=next_obs, rew=reward, done=done)
            if done:
                next_obs = self.env.reset()
            obs = next_obs

        for _ in range(100):
            samples = replay_buffer.sample(agent.batch_size)
            agent.train(samples["obs"], samples["act"], samples["next_obs"],
                        samples["rew"], np.array(samples["done"], dtype=np.float64))
Example #5
class buffer_class:
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)

        # Override the observation length in the replay memory
        env_dict['obs'] = {"dtype": numpy.float32, "shape": (17, )}
        env_dict['next_obs'] = {"dtype": numpy.float32, "shape": (17, )}
        print('!!!!', env_dict['obs'])
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        batch = self.storage.sample(batch_size)
        s_matrix = batch['obs']
        a_matrix = batch['act']
        r_matrix = batch['rew']
        done_matrix = batch['done']
        sp_matrix = batch['next_obs']
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix

    def __len__(self):
        return self.storage.get_stored_size()
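A usage sketch for the buffer_class above; the environment name, batch size, and seed are placeholders (the overridden shapes assume a 17-dimensional observation such as HalfCheetah's).

import gym

env = gym.make("HalfCheetah-v2")
buffer = buffer_class(max_length=100000, seed_number=0, env=env)

s = env.reset()
a = env.action_space.sample()
sp, r, done, _ = env.step(a)
buffer.append(s, a, r, done, sp)

if len(buffer) >= 32:
    s_b, a_b, r_b, done_b, sp_b = buffer.sample(32)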
Example #6
class ReplayBuffer:
    def __init__(self, size, env_dict, n_step_dict=None, min_storage=10000, done_string="done"):
        super().__init__()
        self.done_string = done_string
        self.min_storage = min_storage
        cpprb_args = {
            "size": size,
            "env_dict": env_dict,
            "Nstep": n_step_dict
        }
        self.buffer = CPPRB(**cpprb_args)

    def add(self, data: Sequence[Dict[str, np.ndarray]]) -> None:
        for d in data:
            self.buffer.add(**d)
            if d[self.done_string]:
                self.buffer.on_episode_end()

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"Stored sample count {self.buffer.get_stored_size()} is smaller than the "
                f"minimum storage size {self.min_storage}. Returning None."
            )
            return None
        else:
            return self.buffer.sample(size)
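A minimal usage sketch for the wrapper above, with a placeholder env_dict and a deliberately small min_storage so that sampling succeeds immediately.

import numpy as np

env_dict = {"obs": {"shape": (4,)}, "act": {"shape": (1,)},
            "rew": {}, "next_obs": {"shape": (4,)}, "done": {}}
rb = ReplayBuffer(size=10000, env_dict=env_dict, min_storage=2)

rb.add([
    {"obs": np.zeros(4), "act": np.zeros(1), "rew": 0.0,
     "next_obs": np.ones(4), "done": 0},
    {"obs": np.ones(4), "act": np.ones(1), "rew": 1.0,
     "next_obs": np.zeros(4), "done": 1},  # done triggers on_episode_end()
])
batch = rb.sample(2)  # returns None while fewer than min_storage samples are stored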
Example #7
    def test_ReplayBuffer_with_single_step(self):
        buffer_size = 256
        obs_shape = (3, 4)
        batch_size = 10

        rb = ReplayBuffer(buffer_size, {"obs": {"shape": obs_shape}})

        v = {"obs": np.ones(shape=obs_shape)}

        rb.add(**v)

        rb.sample(batch_size)

        for _ in range(100):
            rb.add(**v)

        rb.sample(batch_size)
Example #8
    def test_next_obs(self):
        buffer_size = 256
        obs_shape = (15, 15)
        act_dim = 5

        rb = ReplayBuffer(buffer_size, {
            "obs": {
                "shape": obs_shape,
                "dtype": np.ubyte
            },
            "act": {
                "shape": act_dim
            },
            "rew": {},
            "done": {}
        },
                          next_of="obs")

        self.assertEqual(rb.get_next_index(), 0)
        self.assertEqual(rb.get_stored_size(), 0)

        obs = np.zeros(obs_shape, dtype=np.ubyte)
        act = np.ones(act_dim)
        rew = 1
        done = 0

        rb.add(obs=obs, act=act, rew=rew, next_obs=obs, done=done)

        self.assertEqual(rb.get_next_index(), 1)
        self.assertEqual(rb.get_stored_size(), 1)

        with self.assertRaises(KeyError):
            rb.add(obs=obs)

        self.assertEqual(rb.get_next_index(), 1)
        self.assertEqual(rb.get_stored_size(), 1)

        next_obs = rb.sample(32)["next_obs"]

        for i in range(512):
            obs = np.ones(obs_shape, dtype=np.ubyte) * i
            rb.add(obs=obs, act=act, rew=rew, next_obs=obs + 1, done=done)

        sample = rb._encode_sample(range(buffer_size))

        ith = rb.get_next_index()
        np.testing.assert_allclose(
            np.roll(sample["obs"], -ith - 1, axis=0)[1:],
            np.roll(sample["next_obs"], -ith - 1, axis=0)[:-1])
Example #9
class buffer_class:
    def __init__(self, max_length, seed_number, env):
        env_dict = create_env_dict(env)
        self.before_add = create_before_add_func(env)
        self.storage = ReplayBuffer(max_length, env_dict)

    def append(self, s, a, r, done, sp):
        self.storage.add(
            **self.before_add(obs=s, act=a, rew=r, done=done, next_obs=sp))

    def sample(self, batch_size):
        batch = self.storage.sample(batch_size)
        s_matrix = batch['obs']
        a_matrix = batch['act']
        r_matrix = batch['rew']
        done_matrix = batch['done']
        sp_matrix = batch['next_obs']
        return s_matrix, a_matrix, r_matrix, done_matrix, sp_matrix

    def __len__(self):
        return self.storage.get_stored_size()
Example #10
def main():
    s_dim = 4
    a_dim = 2
    batch_size = 64

    env = "../envs/point_mass2d.xml"
    sim = Simulation(env, s_dim, a_dim, None, False)

    length = 500
    rb = ReplayBuffer(length,
                      env_dict={"obs": {"shape": (s_dim, 1)},
                                "act": {"shape": (a_dim, 1)},
                                "rew": {},
                                "next_obs": {"shape": (s_dim, 1)},
                                "done": {}})

    x = sim.getState()
    for _ in range(length):
        u = np.random.rand(1, a_dim, 1)
        x_next = sim.step(u)

        rb.add(obs=x, act=u, rew=0, next_obs=x_next, done=False)
        x = x_next


    model = NNModel(dt=0.1, state_dim=s_dim, action_dim=a_dim, name="nn_model")

    stamp = datetime.now().strftime("%Y.%m.%d-%H:%M:%S")
    logdir = "../graphs/test_training/{}".format(stamp)

    writer = tf.summary.create_file_writer(logdir)
    log = True

    epochs = 1000
    for e in range(epochs):
        sample = rb.sample(batch_size)
        gt = sample['next_obs']
        x = sample['obs']
        u = sample['act']
        model.train_step(gt, x, u, e, writer, log)
Example #11
    def test(self):
        buffer_size = 256
        obs_dim = 3
        act_dim = 1
        rew_dim = 2
        rb = ReplayBuffer(
            buffer_size, {
                "obs": {
                    "shape": obs_dim
                },
                "act": {
                    "shape": act_dim
                },
                "rew": {
                    "shape": rew_dim
                },
                "next_obs": {
                    "shape": obs_dim
                },
                "done": {}
            })

        obs = np.ones(shape=(obs_dim))
        act = np.ones(shape=(act_dim))
        rew = (0, 1)
        next_obs = np.ones(shape=(obs_dim))
        done = 0

        for i in range(500):
            rb.add(obs=obs, act=act, rew=rew, next_obs=next_obs, done=done)

        batch_size = 32
        sample = rb.sample(batch_size)

        self.assertEqual(0, sample["rew"][0, 0])
        self.assertEqual(1, sample["rew"][0, 1])
Example #12
class Server(Process):
    def __init__(self, size, env_dict, n_step_dict=None, min_storage=10000, done_string="done"):
        super().__init__()
        self.done_string = done_string
        self.queue = Queue()
        self.size = size
        self.client_pipe, self.server_pipe = Pipe()
        self.env_dict = env_dict
        self.n_step_dict = n_step_dict
        self.parameter = None
        self.min_storage = min_storage
        self.cpprb_args = {
            "size": size,
            "env_dict": env_dict,
            "Nstep": n_step_dict
        }

        # Server lock object
        self.lock = Lock()

    def run(self) -> None:
        self.buffer = CPPRB(
            **self.cpprb_args)
        while True:
            cmd, *args = self.queue.get()
            if cmd == "add":
                self._add(*args)
            elif cmd == "sample":
                self.server_pipe.send(self._sample(*args))
            elif cmd == "upload":
                self._upload(*args)
            elif cmd == "download":
                self.server_pipe.send(self._download())
            else:
                raise ValueError(
                    f"Parameter Server got an unexpected command {cmd}")

    def _download(self) -> Any:
        return self.parameter

    def _upload(self, parameter: Any) -> None:
        self.parameter = parameter

    def _add(self, data: Dict[str, Sequence[np.ndarray]]) -> None:
        self.buffer.add(**data)

    def _sample(self, size: int) -> Dict[str, np.ndarray]:
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"Stored sample count {self.buffer.get_stored_size()} is smaller than the "
                f"minimum storage size {self.min_storage}. Returning None.")
            return None
        else:
            return self.buffer.sample(size)

    def download(self) -> Any:
        cmd = "download"
        self.lock.acquire()
        self.queue.put((cmd, None))
        weights = self.client_pipe.recv()
        self.lock.release()
        return weights

    def upload(self, parameter: Any):
        cmd = "upload"
        self.queue.put((cmd, parameter))

    def add(self, data: Sequence[Dict[str, np.ndarray]]):
        cmd = "add"
        self.queue.put((cmd, data))

    def sample(self, size: int) -> Dict[str, np.ndarray]:
        cmd = "sample"
        self.lock.acquire()
        self.queue.put((cmd, size))
        sample = self.client_pipe.recv()
        self.lock.release()
        return sample
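A client-side sketch for the Server above; the env_dict layout, sizes, and the uploaded parameter are placeholders. Queue commands are handled in FIFO order by run(), so the add is applied before the sample.

import numpy as np

env_dict = {"obs": {"shape": (4,)}, "act": {"shape": (1,)},
            "rew": {}, "next_obs": {"shape": (4,)}, "done": {}}
server = Server(size=10000, env_dict=env_dict, min_storage=1)
server.start()  # run() builds the cpprb buffer inside the child process

server.add({"obs": np.zeros((1, 4)), "act": np.zeros((1, 1)), "rew": np.zeros(1),
            "next_obs": np.ones((1, 4)), "done": np.zeros(1)})
batch = server.sample(1)      # blocks on the pipe until the server replies
server.upload({"step": 0})    # stash the latest parameters on the server
params = server.download()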
Example #13
class Server(Process):
    def __init__(self, size, env_dict, min_storage=100):
        super().__init__()

        self.queue = Queue()
        self.size = size
        self.client_pipe, self.server_pipe = Pipe()
        self.env_dict = env_dict
        self.parameter = None
        self.min_storage = min_storage

        # Server lock object
        self.lock = Lock()

    def run(self):
        self.buffer = CPPRB(self.size, env_dict=self.env_dict)
        while True:
            cmd, *args = self.queue.get()
            if cmd == "add":
                self._add(*args)
            elif cmd == "sample":
                self.server_pipe.send(self._sample(*args))
            elif cmd == "upload":
                self._upload(*args)
            elif cmd == "download":
                self.server_pipe.send(self._download())
            else:
                raise ValueError(
                    f"Parameter Server got an unexpected command {cmd}")

    def _download(self):
        return self.parameter

    def _upload(self, parameter):
        self.parameter = parameter

    def _add(self, data):
        for d in data:
            label_array = list(self.env_dict.keys())
            data_dict = {key: value for key, value in zip(label_array, d)}
            self.buffer.add(**data_dict)

    def _sample(self, size):
        if self.buffer.get_stored_size() < self.min_storage:
            print(
                f"Stored sample count {self.buffer.get_stored_size()} is smaller than the "
                f"minimum storage size {self.min_storage}. Returning None.")
            return None
        else:
            return self.buffer.sample(size)

    def download(self):
        cmd = "download"
        self.lock.acquire()
        self.queue.put((cmd, None))
        weights = self.client_pipe.recv()
        self.lock.release()
        return weights

    def upload(self, parameter):
        cmd = "upload"
        self.queue.put((cmd, parameter))

    def add(self, data):
        cmd = "add"
        self.queue.put((cmd, data))

    def sample(self, size):
        cmd = "sample"
        self.lock.acquire()
        self.queue.put((cmd, size))
        sample = self.client_pipe.recv()
        self.lock.release()
        return sample
Example #14
    while not done_training.is_set():
        while not exp_queue.empty():
            state = exp_queue.get()
            total_rewards.append(state['rew'])
            global_rb.add(**state)
            del state
        if global_rb.get_stored_size() < params.init_replay:
            continue
        if (datetime.now() - start).seconds > 3:
            mean = np.mean(total_rewards[-100:])
            print(
                f'{frames.value:7,} done: {episodes.value:5} mean: {mean:.3f}')
            start = datetime.now()

        batch = global_rb.sample(params.batch_size)
        optimizer.zero_grad()
        loss = calc_loss_dqn(batch, net, tgt_net, params.gamma, device, False)
        loss.backward()
        optimizer.step()
        del batch
        if frames.value % params.sync_nets == 0:
            tgt_net.sync()

        if mean > 10:
            done_training.set()

    exp_queue.close()
    exp_queue.join_thread()

    for p in procs:
Example #15
    else:
        Q = tf.squeeze(model(observation.reshape(1, -1)))
        action = np.argmax(Q)

    egreedy = decay_egreedy(egreedy)

    next_observation, reward, done, info = env.step(action)
    rb.add(obs=observation,
           act=action,
           rew=reward,
           next_obs=next_observation,
           done=done)
    observation = next_observation

    # Uniform sampling
    sample = rb.sample(batch_size * m)

    with tf.GradientTape() as tape:
        tape.watch(model.trainable_weights)
        Q = Q_func(model, tf.constant(sample["obs"]),
                   tf.constant(sample["act"].ravel()),
                   tf.constant(env.action_space.n))
        target_Q = tf.stop_gradient(
            target_func(model, target_model, tf.constant(sample['next_obs']),
                        tf.constant(sample["rew"].ravel()),
                        tf.constant(sample["done"].ravel()), discount,
                        tf.constant(env.action_space.n)))
        tf.summary.scalar("Target Q",
                          data=tf.reduce_mean(target_Q),
                          step=n_step)
        absTD = tf.math.abs(target_Q - Q)
Example #16
class MPCTrainer(Trainer):
    def __init__(self,
                 policy,
                 env,
                 args,
                 reward_fn,
                 buffer_size=int(1e6),
                 n_dynamics_model=1,
                 lr=0.001,
                 **kwargs):
        super().__init__(policy, env, args, **kwargs)

        self.dynamics_buffer = ReplayBuffer(
            **self._prepare_dynamics_buffer_dict(buffer_size=buffer_size))
        self._n_dynamics_model = n_dynamics_model

        # Reward function
        self._reward_fn = reward_fn
        self._prepare_dynamics_model(gpu=args.gpu, lr=lr)

    def _prepare_dynamics_buffer_dict(self, buffer_size):
        # Prepare buffer that stores transitions (s, a, s')
        rb_dict = {
            "size": buffer_size,
            "default_dtype": np.float32,
            "env_dict": {
                "obs": {
                    "shape": get_space_size(self._env.observation_space)
                },
                "next_obs": {
                    "shape": get_space_size(self._env.observation_space)
                },
                "act": {
                    "shape": get_space_size(self._env.action_space)
                }
            }
        }
        return rb_dict

    def _prepare_dynamics_model(self, gpu=0, lr=0.001):
        # Dynamics model
        obs_dim = self._env.observation_space.high.size
        act_dim = self._env.action_space.high.size
        self._dynamics_models = [
            DynamicsModel(input_dim=obs_dim + act_dim,
                          output_dim=obs_dim,
                          gpu=gpu) for _ in range(self._n_dynamics_model)
        ]
        self._optimizers = [
            tf.keras.optimizers.Adam(learning_rate=lr)
            for _ in range(self._n_dynamics_model)
        ]

    def _set_check_point(self, model_dir):
        # Save and restore model
        if isinstance(self._policy, tf.keras.Model):
            super()._set_check_point(model_dir)

    def __call__(self):
        total_steps = 0
        tf.summary.experimental.set_step(total_steps)
        # Gather dataset of random trajectories
        self.logger.info("Ramdomly collect {} samples...".format(
            self._n_random_rollout * self._episode_max_steps))
        self.collect_episodes(n_rollout=self._n_random_rollout)

        for i in range(self._max_iter):
            # Train dynamics f(s, a) according to eq.(2)
            mean_loss = self.fit_dynamics(n_epoch=1)

            total_rew = 0.

            # Collect new sample
            obs = self._env.reset()
            for _ in range(self._episode_max_steps):
                total_steps += 1
                act = self._mpc(obs)
                next_obs, rew, done, _ = self._env.step(act)
                self.dynamics_buffer.add(obs=obs, act=act, next_obs=next_obs)
                total_rew += rew
                if done:
                    break
                obs = next_obs

            tf.summary.experimental.set_step(total_steps)
            tf.summary.scalar("mpc/total_rew", total_rew)
            self.logger.info(
                "iter={0: 3d} total_rew: {1:4.4f} loss: {2:2.8f}".format(
                    i, total_rew, mean_loss))

    def predict_next_state(self, obses, acts):
        obs_diffs = np.zeros_like(obses)
        inputs = np.concatenate([obses, acts], axis=1)
        for dynamics_model in self._dynamics_models:
            obs_diffs += dynamics_model.predict(inputs)
        obs_diffs /= self._n_dynamics_model
        return obses + obs_diffs

    def _mpc(self, obs):
        obses = np.tile(obs, (self._n_sample, 1))
        init_actions = self._policy.get_actions(obses)
        total_rewards = np.zeros(shape=(self._n_sample, ))

        for i in range(self._horizon):
            if i == 0:
                acts = init_actions
            else:
                acts = self._policy.get_actions(obses)
            assert obses.shape[0] == acts.shape[0]
            next_obses = self.predict_next_state(obses, acts)
            rewards = self._reward_fn(obses, acts)
            assert rewards.shape == total_rewards.shape
            total_rewards += rewards
            obses = next_obses

        idx = np.argmax(total_rewards)
        return init_actions[idx]

    def _set_from_args(self, args):
        super()._set_from_args(args)
        self._max_iter = args.max_iter
        self._horizon = args.horizon
        self._n_sample = args.n_sample
        self._n_random_rollout = args.n_random_rollout
        self._batch_size = args.batch_size

    def collect_episodes(self, n_rollout=1):
        for _ in range(n_rollout):
            obs = self._env.reset()
            for _ in range(self._episode_max_steps):
                act = self._policy.get_action(obs)
                next_obs, _, done, _ = self._env.step(act)
                self.dynamics_buffer.add(obs=obs, act=act, next_obs=next_obs)
                obs = next_obs
                if done:
                    break

    @tf.function
    def _fit_dynamics_body(self, inputs, labels):
        losses = []
        for dynamics_model, optimizer in zip(self._dynamics_models,
                                             self._optimizers):
            with tf.GradientTape() as tape:
                predicts = dynamics_model(inputs)
                loss = tf.reduce_mean(0.5 * tf.square(labels - predicts))
            grads = tape.gradient(loss, dynamics_model.trainable_variables)
            optimizer.apply_gradients(
                zip(grads, dynamics_model.trainable_variables))
            losses.append(loss)
        return tf.convert_to_tensor(losses)

    def _make_inputs_output_pairs(self, n_epoch):
        samples = self.dynamics_buffer.sample(
            self.dynamics_buffer.get_stored_size())
        inputs = np.concatenate([samples["obs"], samples["act"]], axis=1)
        labels = samples["next_obs"] - samples["obs"]

        return inputs, labels

    def fit_dynamics(self, n_epoch=1):
        inputs, labels = self._make_inputs_output_pairs(n_epoch)

        dataset = tf.data.Dataset.from_tensor_slices((inputs, labels))
        dataset = dataset.batch(self._batch_size)
        dataset = dataset.shuffle(buffer_size=1000)
        dataset = dataset.repeat(n_epoch)

        mean_losses = np.zeros(shape=(self._n_dynamics_model, ),
                               dtype=np.float32)
        for batch, (x, y) in enumerate(dataset):
            _mean_losses = self._fit_dynamics_body(x, y)
            mean_losses += _mean_losses.numpy()
        mean_losses /= (batch + 1)

        for model_idx, mean_loss in enumerate(mean_losses):
            tf.summary.scalar("mpc/model_{}_loss".format(model_idx), mean_loss)
        return np.mean(mean_losses)

    @staticmethod
    def get_argument(parser=None):
        parser = Trainer.get_argument(parser)
        parser.add_argument('--gpu', type=int, default=0, help='GPU id')
        parser.add_argument("--max-iter", type=int, default=100)
        parser.add_argument("--horizon", type=int, default=20)
        parser.add_argument("--n-sample", type=int, default=1000)
        parser.add_argument("--n-random-rollout", type=int, default=1000)
        parser.add_argument("--batch-size", type=int, default=512)
        return parser
Example #17
def explorer(global_rb,
             queue,
             trained_steps,
             is_training_done,
             lock,
             env_fn,
             policy_fn,
             set_weights_fn,
             noise_level,
             n_env=64,
             n_thread=4,
             buffer_size=1024,
             episode_max_steps=1000,
             gpu=0):
    """
    Collect transitions and store them to prioritized replay buffer.

    :param global_rb (multiprocessing.managers.AutoProxy[PrioritizedReplayBuffer]):
        Prioritized replay buffer sharing with multiple explorers and only one learner.
        This object is shared over processes, so it must be locked when trying to
        operate something with `lock` object.
    :param queue (multiprocessing.Queue):
        A FIFO shared with the `learner` and `evaluator` to get the latest network weights.
        This is process safe, so you don't need to lock the process when using it.
    :param trained_steps (multiprocessing.Value):
        Number of steps to apply gradients.
    :param is_training_done (multiprocessing.Event):
        multiprocessing.Event object to share the status of training.
    :param lock (multiprocessing.Lock):
        multiprocessing.Lock to lock other processes.
    :param env_fn (function):
        Method object to generate an environment.
    :param policy_fn (function):
        Method object to generate an explorer.
    :param set_weights_fn (function):
        Method object to set network weights obtained from the queue.
    :param noise_level (float):
        Noise level for exploration. For epsilon-greedy policies such as DQN variants
        this is epsilon, and for DDPG variants it is the variance of the Normal distribution.
    :param n_env (int):
        Number of environments to distribute. If this is set to be more than 1,
        `MultiThreadEnv` will be used.
    :param n_thread (int):
        Number of thread used in `MultiThreadEnv`.
    :param buffer_size (int):
        Size of the local buffer. Once it is filled with transitions, they are added to `global_rb`.
    :param episode_max_steps (int):
        Maximum number of steps of an episode.
    :param gpu (int):
        GPU id. If this is set to -1, then this process uses only CPU.
    """
    import_tf()
    logger = logging.getLogger("tf2rl")

    if n_env > 1:
        envs = MultiThreadEnv(env_fn=env_fn,
                              batch_size=n_env,
                              thread_pool=n_thread,
                              max_episode_steps=episode_max_steps)
        env = envs._sample_env
    else:
        env = env_fn()

    policy = policy_fn(env=env,
                       name="Explorer",
                       memory_capacity=global_rb.get_buffer_size(),
                       noise_level=noise_level,
                       gpu=gpu)

    kwargs = get_default_rb_dict(buffer_size, env)
    if n_env > 1:
        kwargs["env_dict"]["priorities"] = {}
    local_rb = ReplayBuffer(**kwargs)

    if n_env == 1:
        s = env.reset()
        episode_steps = 0
        total_reward = 0.
        total_rewards = []
    else:
        obses = envs.py_reset()
    start = time.time()
    n_sample, n_sample_old = 0, 0

    while not is_training_done.is_set():
        if n_env == 1:
            n_sample += 1
            episode_steps += 1
            a = policy.get_action(s)
            s_, r, done, _ = env.step(a)
            done_flag = done
            if episode_steps == env._max_episode_steps:
                done_flag = False
            total_reward += r
            local_rb.add(obs=s, act=a, rew=r, next_obs=s_, done=done_flag)

            s = s_
            if done or episode_steps == episode_max_steps:
                s = env.reset()
                total_rewards.append(total_reward)
                total_reward = 0
                episode_steps = 0
        else:
            n_sample += n_env
            obses = envs.py_observation()
            actions = policy.get_action(obses, tensor=True)
            next_obses, rewards, dones, _ = envs.step(actions)
            td_errors = policy.compute_td_error(states=obses,
                                                actions=actions,
                                                next_states=next_obses,
                                                rewards=rewards,
                                                dones=dones)
            local_rb.add(obs=obses,
                         act=actions,
                         next_obs=next_obses,
                         rew=rewards,
                         done=dones,
                         priorities=np.abs(td_errors) + 1e-6)

        # Periodically copy weights of explorer
        if not queue.empty():
            set_weights_fn(policy, queue.get())

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size:
            samples = local_rb.sample(local_rb.get_stored_size())
            if n_env > 1:
                priorities = np.squeeze(samples["priorities"])
            else:
                td_errors = policy.compute_td_error(
                    states=samples["obs"],
                    actions=samples["act"],
                    next_states=samples["next_obs"],
                    rewards=samples["rew"],
                    dones=samples["done"])
                priorities = np.abs(np.squeeze(td_errors)) + 1e-6
            lock.acquire()
            global_rb.add(obs=samples["obs"],
                          act=samples["act"],
                          rew=samples["rew"],
                          next_obs=samples["next_obs"],
                          done=samples["done"],
                          priorities=priorities)
            lock.release()
            local_rb.clear()

            msg = "Grad: {0: 6d}\t".format(trained_steps.value)
            msg += "Samples: {0: 7d}\t".format(n_sample)
            msg += "TDErr: {0:.5f}\t".format(np.average(priorities))
            if n_env == 1:
                ave_rew = 0 if len(total_rewards) == 0 else \
                    sum(total_rewards) / len(total_rewards)
                msg += "AveEpiRew: {0:.3f}\t".format(ave_rew)
                total_rewards = []
            msg += "FPS: {0:.2f}".format(
                (n_sample - n_sample_old) / (time.time() - start))
            logger.info(msg)

            start = time.time()
            n_sample_old = n_sample
Example #18
class SAC:
    """
    Soft Actor Critic
    Ref: https://arxiv.org/pdf/1812.05905.pdf
    """
    def __init__(self,
                 observation_space,
                 action_space,
                 replay_size=int(1e6),
                 gamma=0.99,
                 tau=0.05,
                 lr=3e-4,
                 alpha=0.2,
                 target_update_interval=1,
                 device='cuda'):

        self.gamma = gamma
        self.tau = tau
        self.alpha = alpha
        self.target_update_interval = target_update_interval
        self.device = device
        self.logger = Logger()

        # Experience replay
        rb_kwargs = get_default_rb_dict(observation_space.shape,
                                        action_space.shape, replay_size)
        self.rb = ReplayBuffer(**rb_kwargs)

        # critic
        self.critic = CriticCNN(obs_dim=observation_space.shape[0],
                                act_dim=action_space.shape[0]).to(self.device)
        self.critic_opt = Adam(self.critic.parameters(), lr=lr)

        # critic target
        self.critic_target = CriticCNN(obs_dim=observation_space.shape[0],
                                       act_dim=action_space.shape[0]).to(
                                           self.device)
        self.critic_target.hard_update(self.critic)

        # actor
        self.actor = ActorCNN(obs_dim=observation_space.shape[0],
                              act_dim=action_space.shape[0],
                              action_space=action_space).to(self.device)
        self.actor_opt = Adam(self.actor.parameters(), lr=lr)

        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_opt = Adam([self.log_alpha], lr=lr)

    def select_action(self, obs, evaluate=False):
        obs = torch.FloatTensor(obs).to(self.device).unsqueeze(0)
        if evaluate is False:
            action, _, _ = self.actor.sample(obs)
        else:
            _, _, action = self.actor.sample(obs)
        return action.detach().cpu().numpy()[0]

    def compute_td_error(self, obs, act, next_obs, rew, done):
        with torch.no_grad():
            next_act, next_log_prob, _ = self.actor.sample(next_obs)
            target_q1, target_q2 = self.critic_target(next_obs, next_act)
            target_q = torch.min(target_q1,
                                 target_q2) - self.alpha * next_log_prob
            target_q = rew + ((1 - done) * self.gamma * target_q)

        current_q1, current_q2 = self.critic(obs, act)

        td_error1 = current_q1 - target_q
        td_error2 = current_q2 - target_q

        return td_error1, td_error2

    def critic_loss(self, obs, act, next_obs, rew, done):
        td_error1, td_error2 = self.compute_td_error(obs, act, next_obs, rew,
                                                     done)

        # Huber loss on the TD errors
        loss1 = huber_loss(td_error1).mean()
        loss2 = huber_loss(td_error2).mean()

        return loss1 + loss2

    def actor_alpha_loss(self, obs):

        act, log_prob, _ = self.actor.sample(obs)

        current_q1, current_q2 = self.critic(obs, act)
        min_q = torch.min(current_q1, current_q2)

        actor_loss = ((self.alpha * log_prob) - min_q).mean()

        # alpha loss
        alpha_loss = -(self.log_alpha *
                       (log_prob + self.target_entropy).detach()).mean()

        return actor_loss, alpha_loss

    def update_critic(self, obs, act, next_obs, rew, done):
        loss = self.critic_loss(obs, act, next_obs, rew, done)

        # update both critics
        self.critic_opt.zero_grad()
        loss.backward(retain_graph=True)
        self.critic_opt.step()

        return loss

    def update_actor_alpha(self, obs):
        actor_loss, alpha_loss = self.actor_alpha_loss(obs)

        # update actor
        self.actor_opt.zero_grad()
        actor_loss.backward()
        self.actor_opt.step()

        # update alpha
        self.alpha_opt.zero_grad()
        alpha_loss.backward()
        self.alpha_opt.step()

        return actor_loss, alpha_loss

    def update_parameters(self, batch_size, updates):

        batch = self.rb.sample(batch_size)

        # to tensor
        obs = torch.FloatTensor(batch['obs']).to(self.device)
        act = torch.FloatTensor(batch['act']).to(self.device)
        next_obs = torch.FloatTensor(batch['next_obs']).to(self.device)
        rew = torch.FloatTensor(batch['rew']).to(self.device)
        done = torch.FloatTensor(batch['done']).to(self.device)

        # update actor & critic & alpha
        critic_loss = self.update_critic(obs, act, next_obs, rew, done)
        actor_loss, alpha_loss = self.update_actor_alpha(obs)

        # apply alpha
        self.alpha = self.log_alpha.exp()

        # update target network
        if updates % self.target_update_interval == 0:
            self.critic_target.soft_update(self.critic, self.tau)

        return critic_loss, actor_loss, alpha_loss, self.alpha.clone()

    def load_model(self, actor, critic):
        self.actor = actor
        self.critic = critic
Example #19
class Agent:
    def __init__(self,
                 lr,
                 state_shape,
                 num_actions,
                 batch_size,
                 max_mem_size=1000):
        self.lr = lr
        self.gamma = 0.99
        self.action_space = list(range(num_actions))
        self.batch_size = batch_size
        self.target_update_interval = 200
        self.step_count = 0

        self.epsilon = Lerper(start=1.0, end=0.01, num_steps=2000)

        self.memory = ReplayBuffer(
            max_mem_size, {
                "obs": {
                    "shape": state_shape
                },
                "act": {
                    "shape": 1
                },
                "rew": {},
                "next_obs": {
                    "shape": state_shape
                },
                "done": {
                    "shape": 1
                }
            })

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        # self.device = torch.device("cpu")

        self.V_MIN, self.V_MAX = 0, 200
        self.NUM_ATOMS = 4
        self.support = torch.linspace(self.V_MIN, self.V_MAX,
                                      self.NUM_ATOMS).to(self.device)
        self.net = Network(lr, state_shape, num_actions, self.support,
                           self.NUM_ATOMS).to(self.device)
        self.net_ = Network(lr, state_shape, num_actions, self.support,
                            self.NUM_ATOMS).to(self.device)

        self.net_.load_state_dict(self.net.state_dict())

    def choose_action(self, observation):
        if np.random.random() > self.epsilon.value():
            state = torch.tensor(observation).float().detach()
            state = state.to(self.device)
            state = state.unsqueeze(0)

            q_values = self.net(state)
            action = torch.argmax(q_values).item()
            return action
        else:
            return np.random.choice(self.action_space)

    def store_memory(self, state, action, reward, next_state, done):
        self.memory.add(obs=state,
                        act=action,
                        rew=reward,
                        next_obs=next_state,
                        done=done)

    def learn(self):
        if self.memory.get_stored_size() < self.batch_size:
            return

        batch = self.memory.sample(self.batch_size)

        states = torch.tensor(batch["obs"]).to(self.device)
        actions = torch.tensor(batch["act"],
                               dtype=torch.int64).to(self.device).T[0]
        rewards = torch.tensor(batch["rew"]).to(self.device)
        states_ = torch.tensor(batch["next_obs"]).to(self.device)
        dones = torch.tensor(batch["done"],
                             dtype=torch.float32).to(self.device)

        batch_index = np.arange(self.batch_size, dtype=np.int64)

        #   the spacing between adjacent atoms of the value support
        delta_z = float(self.V_MAX - self.V_MIN) / (self.NUM_ATOMS - 1)

        with torch.no_grad():
            qs_ = self.net_(states_)  #[64,2]
            actions_ = qs_.argmax(dim=1)  #[64]
            dists_ = self.net_.dist(states_)  #[64,2,8]
            action_dist_ = dists_[batch_index, actions_]  #[64,8]

            # print(action_dist_)
            # print(action_dist_.shape)
            # quit()

            #done    #[64,1]
            #reward  #[64,1]
            #support #[51]
            print("support")
            print(self.support)
            print(self.support.shape)
            t_z = rewards + (
                1 - dones) * self.gamma * self.support  #   shape=[64,8]
            # t_z = torch.tensor((self.batch_size,)).to(self.device) * self.support
            t_z = torch.zeros(
                (self.batch_size, self.NUM_ATOMS)).to(self.device)
            tzindxs = np.arange(6)
            t_z[tzindxs] = self.support

            print("t-z")
            print(t_z)
            print(t_z.shape)
            # quit()

            #   clamp and project the target distribution onto the support
            t_z = t_z.clamp(min=self.V_MIN, max=self.V_MAX)
            b = (t_z - self.V_MIN) / delta_z  #   quantize
            l = b.floor().long()  #   indices
            u = b.ceil().long()  #   offsets to the closest reward bracket

            print(t_z)
            print(t_z.shape)
            # quit()

            print(b)
            print(b.shape)
            print(l)
            print(l.shape)
            print(u)
            print(u.shape)
            # quit()

            #   this is a giant indexing array
            offset = (  #[64,8] #[[0..0],[8..8],[16..16],,,[504..504]
                torch.linspace(0, (self.batch_size - 1) * self.NUM_ATOMS,
                               self.batch_size).long().unsqueeze(1).expand(
                                   self.batch_size,
                                   self.NUM_ATOMS).to(self.device))

            print("\noffset")
            print(offset)
            print(offset.shape)

            frac = u.float() - b  #   percentages, decreasing, axis = 1
            dec_frac = b - l.float()  #   percentages, increasing, axis = 1

            # print(something_else)
            # print(something_else.shape)
            # quit()

            action_dist_ = torch.ones(
                (self.batch_size, self.NUM_ATOMS)).to(self.device)

            proj_dist = torch.zeros(action_dist_.size(),
                                    device=self.device)  #   [64,8]

            print("proj_dist")
            print(proj_dist)
            print(proj_dist.shape)

            print("action_dist_")
            print(action_dist_)
            print(action_dist_.shape)
            # print(frac)
            # print(frac.shape)

            print("l")
            print(l)
            print(l.shape)

            print("offset")
            print(offset)
            print(offset.shape)

            proj_dist.view(-1).index_add_(  #[64,8]
                0,
                (l + offset).view(-1),
                (action_dist_).view(-1)  #(action_dist_ * frac).view(-1)
            )
            print("RESULT: proj_dist")
            print(proj_dist)
            print(proj_dist.shape)
            proj_dist.view(-1).index_add_(  #[64,8]
                0,
                (u + offset).view(-1),
                (action_dist_).view(-1)  #(action_dist_ * dec_frac).view(-1)
            )

            print("proj_dist")
            print(proj_dist)
            print(proj_dist.shape)
            quit()

            # print(dec_frac)
            # print(dec_frac.shape)
            # quit()

        # print(actions)
        # print(actions.shape)
        # quit()

        dists = self.net.dist(states)  #[64,2,8]
        log_p = torch.log(dists[batch_index, actions])

        loss = -(proj_dist * log_p).sum(1).mean()

        self.net.optimizer.zero_grad()
        loss.backward()
        self.net.optimizer.step()

        self.epsilon.step()

        self.step_count += 1

        if self.step_count % self.target_update_interval == 0:
            print("targnet update!!")
            self.net_.load_state_dict(self.net.state_dict())

        return loss
Example #20
 
 time_step = env.reset()
 state = np.concatenate( [ time_step.observation[key] 
                          for key in list( time_step.observation.keys() ) ] )
 score = 0
 
 for t in range(int(max_t)):      
     action = agent.get_action(state)
     time_step = env.step(action)
     reward, done = time_step.reward, time_step.last()
     next_state = np.concatenate( [ time_step.observation[key] 
                                   for key in list( time_step.observation.keys() ) ] )
     
     # Learn, if enough samples are available in memory
     if rb.get_stored_size() > BATCH_SIZE:
         data = rb.sample(BATCH_SIZE)                
         states = data['obs']; actions = data['act']; rewards = data['rew']
         next_states = data['next_obs']; dones = data['done']
         
         actor_loss, critic_loss, _ = agent.train(states, 
                                                  actions, 
                                                  next_states, 
                                                  rewards, 
                                                  dones)
         with summary_writer.as_default():
             tf.summary.scalar(name="actor_loss",
                               data=actor_loss,
                               step=t)
             tf.summary.scalar(name="critic_loss",
                               data=critic_loss,
                               step=t)
Example #21
def explorer(global_rb, queue, trained_steps, n_transition,
             is_training_done, lock, env_fn, policy_fn,
             buffer_size=1024, max_transition=None,
             episode_max_steps=1000):
    """
    Collect transitions and store them to prioritized replay buffer.
    Args:
        global_rb:
            Prioritized replay buffer sharing with multiple explorers and only one learner.
            This object is shared over processes, so it must be locked when trying to
            operate something with `lock` object.
        queue:
            A FIFO shared with the learner to get the latest network parameters.
            This is process safe, so you don't need to lock the process when using it.
        trained_steps:
            Number of steps to apply gradients.
        n_transition:
            Number of collected transitions.
        is_training_done:
            multiprocessing.Event object to share the status of training.
        lock:
            multiprocessing.Lock to lock other processes. You must release it after the operation is done.
        env_fn:
            Method object to generate an environment.
        policy_fn:
            Method object to generate an explorer.
        buffer_size:
            Size of the local buffer. Once it is filled with transitions, they are added to `global_rb`.
        max_transition:
            Maximum number of steps to explore. Default value is None.
        episode_max_steps:
            Maximum number of steps of an episode.
    """
    env = env_fn()
    policy = policy_fn(env, "Explorer", global_rb.get_buffer_size())
    local_rb = ReplayBuffer(obs_shape=env.observation_space.shape,
                            act_dim=env.action_space.low.size,
                            size=buffer_size)

    s = env.reset()
    episode_steps = 0
    total_reward = 0.
    total_rewards = []
    start = time.time()
    sample_at_start = 0

    while not is_training_done.is_set():
        # Periodically copy weights of explorer
        if not queue.empty():
            actor_weights, critic_weights, critic_target_weights = queue.get()
            update_target_variables(policy.actor.weights, actor_weights, tau=1.)
            update_target_variables(policy.critic.weights, critic_weights, tau=1.)
            update_target_variables(policy.critic_target.weights, critic_target_weights, tau=1.)

        n_transition.value += 1
        episode_steps += 1
        a = policy.get_action(s)
        s_, r, done, _ = env.step(a)
        done_flag = done
        if episode_steps == env._max_episode_steps:
            done_flag = False
        total_reward += r
        local_rb.add(s, a, r, s_, done_flag)

        s = s_
        if done or episode_steps == episode_max_steps:
            s = env.reset()
            total_rewards.append(total_reward)
            total_reward = 0
            episode_steps = 0

        # Add collected experiences to global replay buffer
        if local_rb.get_stored_size() == buffer_size - 1:
            temp_n_transition = n_transition.value
            samples = local_rb.sample(local_rb.get_stored_size())
            states, next_states, actions, rewards, done = samples["obs"], samples["next_obs"], samples["act"], samples["rew"], samples["done"]
            done = np.array(done, dtype=np.float64)
            td_errors = policy.compute_td_error(
                states, actions, next_states, rewards, done)
            print("Grad: {0: 6d}\tSamples: {1: 7d}\tTDErr: {2:.5f}\tAveEpiRew: {3:.3f}\tFPS: {4:.2f}".format(
                trained_steps.value, n_transition.value, np.average(np.abs(td_errors).flatten()),
                sum(total_rewards) / len(total_rewards), (temp_n_transition - sample_at_start) / (time.time() - start)))
            total_rewards = []
            lock.acquire()
            global_rb.add(
                states, actions, rewards, next_states, done,
                priorities=np.abs(td_errors)+1e-6)
            lock.release()
            local_rb.clear()
            start = time.time()
            sample_at_start = n_transition.value

        if max_transition is not None and n_transition.value >= max_transition:
            is_training_done.set()
Example #22
    if use_prioritized_rb and use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepPrioritizedReplayBuffer(**kwargs)

    # prioritized
    if use_prioritized_rb:
        return PrioritizedReplayBuffer(**kwargs)

    # N-step
    if use_nstep_rb:
        kwargs["n_step"] = n_step
        kwargs["discount"] = policy.discount
        return NstepReplayBuffer(**kwargs)

    if isinstance(kwargs["act_dim"], tuple):
        kwargs["act_dim"] = kwargs["act_dim"][0]
    return ReplayBuffer(**kwargs)


if __name__ == '__main__':
    from cpprb import ReplayBuffer
    import numpy as np

    rb = ReplayBuffer(obs_dim=3, act_dim=3, size=10)
    for i in range(10):
        obs_act = np.array([i for _ in range(3)], dtype=np.float64)
        print(obs_act)
        rb.add(obs=obs_act, act=obs_act, next_obs=obs_act, rew=float(i), done=False)
    print(rb.sample(10))
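The __main__ demo above targets an older or wrapped ReplayBuffer signature (obs_dim/act_dim/size). With the dictionary-based cpprb API used in the other examples, an equivalent sketch would be:

from cpprb import ReplayBuffer
import numpy as np

rb = ReplayBuffer(10, {"obs": {"shape": 3}, "act": {"shape": 3},
                       "next_obs": {"shape": 3}, "rew": {}, "done": {}})
for i in range(10):
    obs_act = np.full(3, i, dtype=np.float64)
    rb.add(obs=obs_act, act=obs_act, next_obs=obs_act, rew=float(i), done=False)
print(rb.sample(10))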