Example 1
    def _initialize(self):
        """Initialize non-common things."""
        self.per_beta = self.hyper_params.per_beta

        self.use_n_step = self.hyper_params.n_step > 1

        if not self.args.test:
            # load demo replay memory
            with open(self.args.demo_path, "rb") as f:
                demos = pickle.load(f)

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                # replay memory for multi-steps
                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory for a single step
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha)

        self.learner_cfg.type = "DDPGfDLearner"
        self.learner = build_learner(self.learner_cfg)
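
The example above relies on common_utils.get_n_step_info_from_demo to split the loaded demonstrations into 1-step and n-step transitions. As a point of reference, here is a minimal sketch of that idea, assuming each demo transition is a (state, action, reward, next_state, done) tuple; n_step_from_demo is a hypothetical helper, and the library function's exact signature and return format may differ.

from collections import deque
from typing import Deque, List, Tuple


def n_step_from_demo(demo: List[Tuple], n_step: int, gamma: float) -> Tuple[List, List]:
    """Rebuild 1-step and n-step transitions from (s, a, r, s', done) demos.

    Hypothetical sketch of what get_n_step_info_from_demo does; the real
    implementation may differ.
    """
    demos_1_step, demos_n_step = [], []
    window: Deque = deque(maxlen=n_step)
    for transition in demo:
        window.append(transition)
        if len(window) < n_step:
            continue
        # accumulate the discounted reward backwards over the window and
        # keep the earliest terminal (next_state, done) if one occurs
        reward, next_state, done = window[-1][2:]
        for _, _, r, n_s, d in reversed(list(window)[:-1]):
            reward = r + gamma * reward * (1 - d)
            next_state, done = (n_s, d) if d else (next_state, done)
        state, action = window[0][:2]
        demos_1_step.append(window[0])
        demos_n_step.append((state, action, reward, next_state, done))
    return demos_1_step, demos_n_step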
Example 2
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:

            self.memory = RecurrentReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                self.hyper_params.sequence_size,
                self.hyper_params.overlap_size,
                n_step=self.hyper_params.n_step,
                gamma=self.hyper_params.gamma,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha)

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = RecurrentReplayBuffer(
                    self.hyper_params.buffer_size,
                    self.hyper_params.batch_size,
                    self.hyper_params.sequence_size,
                    self.hyper_params.overlap_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                )

        self.learner = build_learner(self.learner_cfg)
Example 3
    def _initialize(self):
        """Initialize non-common things."""
        if not self.is_test:
            # replay memory for a single step
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha)

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = ReplayBuffer(
                    self.hyper_params.buffer_size,
                    self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                )

        build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            is_test=self.is_test,
            load_from=self.load_from,
        )
        self.learner = build_learner(self.learner_cfg, build_args)
Example 4
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:
            # load demo replay memory
            demos = self._load_demos()

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                demo=demos,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory,
                alpha=self.hyper_params.per_alpha,
                epsilon_d=self.hyper_params.per_eps_demo,
            )

        self.learner_cfg.type = "DQfDLearner"
        self.learner = build_learner(self.learner_cfg)
Example 5
    def _spawn(self):
        """Intialize distributed worker, learner and centralized replay buffer."""
        replay_buffer = ReplayBuffer(
            self.hyper_params.buffer_size,
            self.hyper_params.batch_size,
        )
        per_buffer = PrioritizedBufferWrapper(
            replay_buffer, alpha=self.hyper_params.per_alpha)
        self.global_buffer = ApeXBufferWrapper.remote(per_buffer, self.args,
                                                      self.hyper_params,
                                                      self.comm_cfg)

        learner = build_learner(self.learner_cfg)
        self.learner = ApeXLearnerWrapper.remote(learner, self.comm_cfg)

        state_dict = learner.get_state_dict()
        worker_build_args = dict(args=self.args, state_dict=state_dict)

        self.workers = []
        self.num_workers = self.hyper_params.num_workers
        for rank in range(self.num_workers):
            worker_build_args["rank"] = rank
            worker = build_worker(self.worker_cfg,
                                  build_args=worker_build_args)
            apex_worker = ApeXWorkerWrapper.remote(worker, self.args,
                                                   self.comm_cfg)
            self.workers.append(apex_worker)

        self.logger = build_logger(self.logger_cfg)

        self.processes = self.workers + [
            self.learner, self.global_buffer, self.logger
        ]
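Example 6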
def generate_prioritized_buffer(
    buffer_length: int, batch_size: int, idx_lst=None, prior_lst=None
) -> Tuple[PrioritizedBufferWrapper, List]:
    """Generate Prioritized Replay Buffer with random Prior."""
    buffer = ReplayBuffer(max_len=buffer_length, batch_size=batch_size)
    prioritized_buffer = PrioritizedBufferWrapper(buffer)
    priority = np.random.randint(10, size=buffer_length)
    for i, j in enumerate(priority):
        prioritized_buffer.sum_tree[i] = j
    if idx_lst:
        for i, j in list(zip(idx_lst, prior_lst)):
            priority[i] = j
            prioritized_buffer.sum_tree[i] = j

    prop_lst = [i / sum(priority) for i in priority]

    return prioritized_buffer, prop_lst
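
A usage sketch for the helper above: pin two indices to known priorities and check the analytic proportions it returns. All values below are illustrative.

# usage sketch for generate_prioritized_buffer (illustrative values)
buffer, prop_lst = generate_prioritized_buffer(
    buffer_length=32, batch_size=8, idx_lst=[0, 1], prior_lst=[5, 0]
)
assert abs(sum(prop_lst) - 1.0) < 1e-6  # proportions form a distribution
assert prop_lst[1] == 0.0               # zero priority is never sampled proportionally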
Example 7
    def _initialize(self):
        """Initialize non-common things."""
        self.per_beta = self.hyper_params.per_beta
        self.use_n_step = self.hyper_params.n_step > 1

        if not self.is_test:
            # load demo replay memory
            with open(self.hyper_params.demo_path, "rb") as f:
                demos = pickle.load(f)

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                # replay memory for multi-steps
                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory for a single step
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                demo=demos,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory,
                alpha=self.hyper_params.per_alpha,
                epsilon_d=self.hyper_params.per_eps_demo,
            )

        build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            noise_cfg=self.noise_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.shape[0],
            is_test=self.is_test,
            load_from=self.load_from,
        )
        self.learner = build_learner(self.learner_cfg, build_args)
Example 8
    def _initialize(self):
        """Initialize non-common things."""
        if not self.is_test:
            # load demo replay memory
            demos = self._load_demos()

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                demo=demos,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory,
                alpha=self.hyper_params.per_alpha,
                epsilon_d=self.hyper_params.per_eps_demo,
            )

        build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            is_test=self.is_test,
            load_from=self.load_from,
        )
        self.learner_cfg.type = "DQfDLearner"
        self.learner = build_learner(self.learner_cfg, build_args)
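Example 9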
def sample_dummy(prioritized_buffer: PrioritizedBufferWrapper, times: int) -> List:
    """Sample from prioritized buffer and Return indices."""
    assert isinstance(prioritized_buffer, PrioritizedBufferWrapper)

    sampled_lst = [0] * prioritized_buffer.buffer.max_len
    for _ in range(times):
        indices = prioritized_buffer._sample_proportional(
            prioritized_buffer.buffer.batch_size
        )
        for idx in indices:
            sampled_lst[idx] += 1 / (times * prioritized_buffer.buffer.batch_size)
    return sampled_lst
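
A test-style sketch tying the two helpers together: with enough draws, the empirical frequencies returned by sample_dummy should roughly track the analytic proportions from generate_prioritized_buffer. The draw count and tolerance below are illustrative, not taken from the project's test suite.

import numpy as np

prioritized_buffer, prop_lst = generate_prioritized_buffer(buffer_length=32, batch_size=8)
sampled_lst = sample_dummy(prioritized_buffer, times=10000)
assert np.allclose(sampled_lst, prop_lst, atol=0.05)  # rough agreement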
Example 10
class DDPGfDAgent(DDPGAgent):
    """ActorCritic interacting with environment.

    Attributes:
        memory (PrioritizedReplayBuffer): replay memory
        per_beta (float): beta parameter for prioritized replay buffer
        use_n_step (bool): whether or not to use n-step returns

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        self.per_beta = self.hyper_params.per_beta

        self.use_n_step = self.hyper_params.n_step > 1

        if not self.args.test:
            # load demo replay memory
            with open(self.args.demo_path, "rb") as f:
                demos = pickle.load(f)

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                # replay memory for multi-steps
                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory for a single step
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha)

        self.learner_cfg.type = "DDPGfDLearner"
        self.learner = build_learner(self.learner_cfg)

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # add n-step transition
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # add a single step transition
        # if transition is not an empty tuple
        if transition:
            self.memory.add(transition)

    def sample_experience(self) -> Tuple[torch.Tensor, ...]:
        experience_1 = self.memory.sample(self.per_beta)
        if self.use_n_step:
            indices = experience_1[-2]
            experience_n = self.memory_n.sample(indices)
            return numpy2floattensor(experience_1), numpy2floattensor(
                experience_n)

        return numpy2floattensor(experience_1)

    def pretrain(self):
        """Pretraining steps."""
        pretrain_loss = list()
        pretrain_step = self.hyper_params.pretrain_step
        print("[INFO] Pre-Train %d step." % pretrain_step)
        for i_step in range(1, pretrain_step + 1):
            t_begin = time.time()
            experience = self.sample_experience()
            info = self.learner.update_model(experience)
            loss = info[0:2]
            t_end = time.time()
            pretrain_loss.append(loss)  # for logging

            # logging
            if i_step == 1 or i_step % 100 == 0:
                avg_loss = np.vstack(pretrain_loss).mean(axis=0)
                pretrain_loss.clear()
                log_value = (0, avg_loss, 0, t_end - t_begin)
                self.write_log(log_value)
        print("[INFO] Pre-Train Complete!\n")

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.actor, self.critic], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0
            self.episode_step = 0
            losses = list()

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                if len(self.memory) >= self.hyper_params.batch_size:
                    for _ in range(self.hyper_params.multiple_update):
                        experience = self.sample_experience()
                        info = self.learner.update_model(experience)
                        loss = info[0:2]
                        indices, new_priorities = info[2:4]
                        losses.append(loss)  # for logging
                        self.memory.update_priorities(indices, new_priorities)

                # increase priority beta
                fraction = min(
                    float(self.i_episode) / self.args.episode_num, 1.0)
                self.per_beta = self.per_beta + fraction * (1.0 -
                                                            self.per_beta)

                state = next_state
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            # logging
            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)
                losses.clear()

            if self.i_episode % self.args.save_period == 0:
                self.learner.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.learner.save_params(self.i_episode)
        self.interim_test()
Example 11
class DQNAgent(Agent):
    """DQN interacting with environment.

    Attribute:
        env (gym.Env): openAI Gym environment
        hyper_params (ConfigDict): hyper-parameters
        log_cfg (ConfigDict): configuration for saving log and checkpoint
        network_cfg (ConfigDict): config of network for training agent
        optim_cfg (ConfigDict): config of optimizer
        state_dim (int): state size of env
        action_dim (int): action size of env
        memory (PrioritizedReplayBuffer): replay memory
        curr_state (np.ndarray): temporary storage of the current state
        total_step (int): total step number
        episode_step (int): step number of the current episode
        i_episode (int): current episode number
        epsilon (float): parameter for epsilon greedy policy
        n_step_buffer (deque): n-size buffer to calculate n-step returns
        per_beta (float): beta parameter for prioritized replay buffer
        use_n_step (bool): whether or not to use n-step returns

    """
    def __init__(
        self,
        env: gym.Env,
        env_info: ConfigDict,
        hyper_params: ConfigDict,
        learner_cfg: ConfigDict,
        log_cfg: ConfigDict,
        is_test: bool,
        load_from: str,
        is_render: bool,
        render_after: int,
        is_log: bool,
        save_period: int,
        episode_num: int,
        max_episode_steps: int,
        interim_test_num: int,
    ):
        """Initialize."""
        Agent.__init__(
            self,
            env,
            env_info,
            log_cfg,
            is_test,
            load_from,
            is_render,
            render_after,
            is_log,
            save_period,
            episode_num,
            max_episode_steps,
            interim_test_num,
        )

        self.curr_state = np.zeros(1)
        self.episode_step = 0
        self.i_episode = 0

        self.hyper_params = hyper_params
        self.learner_cfg = learner_cfg

        self.per_beta = hyper_params.per_beta
        self.use_n_step = hyper_params.n_step > 1

        if self.learner_cfg.head.configs.use_noisy_net:
            self.max_epsilon = 0.0
            self.min_epsilon = 0.0
            self.epsilon = 0.0
        else:
            self.max_epsilon = hyper_params.max_epsilon
            self.min_epsilon = hyper_params.min_epsilon
            self.epsilon = hyper_params.max_epsilon

        self._initialize()

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.is_test:
            # replay memory for a single step
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha)

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = ReplayBuffer(
                    self.hyper_params.buffer_size,
                    self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                )

        build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            is_test=self.is_test,
            load_from=self.load_from,
        )
        self.learner = build_learner(self.learner_cfg, build_args)

    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state

        # epsilon greedy policy
        if not self.is_test and self.epsilon > np.random.random():
            selected_action = np.array(self.env.action_space.sample())
        else:
            with torch.no_grad():
                state = self._preprocess_state(state)
                selected_action = self.learner.dqn(state).argmax()
            selected_action = selected_action.detach().cpu().numpy()
        return selected_action

    # pylint: disable=no-self-use
    def _preprocess_state(self, state: np.ndarray) -> torch.Tensor:
        """Preprocess state so that actor selects an action."""
        state = numpy2floattensor(state, self.learner.device)
        return state

    def step(self,
             action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)

        if not self.is_test:
            # if the last state is not a terminal state, store done as false
            done_bool = False if self.episode_step == self.max_episode_steps else done

            transition = (self.curr_state, action, reward, next_state,
                          done_bool)
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # add n-step transition
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # add a single step transition
        # if transition is not an empty tuple
        if transition:
            self.memory.add(transition)

    def write_log(self, log_value: tuple):
        """Write log about loss and score"""
        i, loss, score, avg_time_cost = log_value
        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
            "epsilon: %f, loss: %f, avg q-value: %f (spent %.6f sec/step)\n" %
            (
                i,
                self.episode_step,
                self.total_step,
                score,
                self.epsilon,
                loss[0],
                loss[1],
                avg_time_cost,
            ))

        if self.is_log:
            wandb.log({
                "score": score,
                "epsilon": self.epsilon,
                "dqn loss": loss[0],
                "avg q values": loss[1],
                "time per each step": avg_time_cost,
                "total_step": self.total_step,
            })

    # pylint: disable=no-self-use, unnecessary-pass
    def pretrain(self):
        """Pretraining steps."""
        pass

    def sample_experience(self) -> Tuple[torch.Tensor, ...]:
        """Sample experience from replay buffer."""
        experiences_1 = self.memory.sample(self.per_beta)
        experiences_1 = (
            numpy2floattensor(experiences_1[:6], self.learner.device) +
            experiences_1[6:])

        if self.use_n_step:
            indices = experiences_1[-2]
            experiences_n = self.memory_n.sample(indices)
            return (
                experiences_1,
                numpy2floattensor(experiences_n, self.learner.device),
            )

        return experiences_1

    def train(self):
        """Train the agent."""
        # logger
        if self.is_log:
            self.set_wandb()
            # wandb.watch([self.dqn], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.episode_num + 1):
            state = self.env.reset()
            self.episode_step = 0
            losses = list()
            done = False
            score = 0

            t_begin = time.time()

            while not done:
                if self.is_render and self.i_episode >= self.render_after:
                    self.env.render()
                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                if len(self.memory) >= self.hyper_params.update_starts_from:
                    if self.total_step % self.hyper_params.train_freq == 0:
                        for _ in range(self.hyper_params.multiple_update):
                            experience = self.sample_experience()
                            info = self.learner.update_model(experience)
                            loss = info[0:2]
                            indices, new_priorities = info[2:4]
                            losses.append(loss)  # for logging
                            self.memory.update_priorities(
                                indices, new_priorities)

                    # decrease epsilon
                    self.epsilon = max(
                        self.epsilon - (self.max_epsilon - self.min_epsilon) *
                        self.hyper_params.epsilon_decay,
                        self.min_epsilon,
                    )

                    # increase priority beta
                    fraction = min(
                        float(self.i_episode) / self.episode_num, 1.0)
                    self.per_beta = self.per_beta + fraction * (1.0 -
                                                                self.per_beta)

                state = next_state
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)

            if self.i_episode % self.save_period == 0:
                self.learner.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.learner.save_params(self.i_episode)
        self.interim_test()
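
In the train loop above, per_beta is annealed toward 1.0 while sampling; in standard prioritized experience replay (Schaul et al., 2016), beta controls the importance-sampling correction applied to each sampled transition. A minimal sketch of that weight computation is shown below, assuming a dense array of strictly positive priorities rather than the wrapper's sum tree.

import numpy as np


def per_is_weights(priorities: np.ndarray, beta: float, alpha: float = 0.6) -> np.ndarray:
    """Importance-sampling weights for prioritized replay (sketch only).

    PrioritizedBufferWrapper presumably derives these from its sum tree;
    this dense version only illustrates the formula w_i = (N * P(i))^(-beta).
    """
    probs = priorities ** alpha        # P(i) ~ p_i^alpha
    probs = probs / probs.sum()
    weights = (len(priorities) * probs) ** (-beta)
    return weights / weights.max()     # normalize so the largest weight is 1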
Example 12
class R2D1Agent(DQNAgent):
    """R2D1 interacting with environment.

    Attribute:
        memory (RecurrentPrioritizedReplayBuffer): replay memory for recurrent agent
        memory_n (RecurrentReplayBuffer): nstep replay memory for recurrent agent
    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.is_test:

            self.memory = RecurrentReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                self.hyper_params.sequence_size,
                self.hyper_params.overlap_size,
                n_step=self.hyper_params.n_step,
                gamma=self.hyper_params.gamma,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha
            )

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = RecurrentReplayBuffer(
                    self.hyper_params.buffer_size,
                    self.hyper_params.batch_size,
                    self.hyper_params.sequence_size,
                    self.hyper_params.overlap_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                )

        build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            is_test=self.is_test,
            load_from=self.load_from,
        )
        self.learner = build_learner(self.learner_cfg, build_args)

    def select_action(
        self,
        state: np.ndarray,
        hidden_state: torch.Tensor,
        prev_action: torch.Tensor,
        prev_reward: np.ndarray,
    ) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state

        # epsilon greedy policy
        state = self._preprocess_state(state)
        with torch.no_grad():
            selected_action, hidden_state = self.learner.dqn(
                state, hidden_state, prev_action, prev_reward
            )
        selected_action = selected_action.detach().argmax().cpu().numpy()

        if not self.is_test and self.epsilon > np.random.random():
            selected_action = np.array(self.env.action_space.sample())
        return selected_action, hidden_state

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # Add n-step transition
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # Add a single step transition
        # If transition is not an empty tuple
        if transition:
            self.memory.add(transition)

    def step(
        self, action: np.ndarray, hidden_state: torch.Tensor
    ) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)
        if not self.is_test:
            # if the last state is not a terminal state, store done as false
            done_bool = False if self.episode_step == self.max_episode_steps else done

            transition = (
                self.curr_state,
                action,
                hidden_state.detach(),
                reward,
                next_state,
                done_bool,
            )
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def sample_experience(self) -> Tuple[torch.Tensor, ...]:
        experiences_1 = self.memory.sample(self.per_beta)
        experiences_1 = (
            numpy2floattensor(experiences_1[:3], self.learner.device)
            + (experiences_1[3],)
            + numpy2floattensor(experiences_1[4:6], self.learner.device)
            + (experiences_1[6:])
        )
        if self.use_n_step:
            indices = experiences_1[-2]
            experiences_n = self.memory_n.sample(indices)
            return (
                experiences_1,
                numpy2floattensor(experiences_n[:3], self.learner.device)
                + (experiences_n[3],)
                + numpy2floattensor(experiences_n[4:], self.learner.device),
            )

        return experiences_1

    def train(self):
        """Train the agent."""
        # Logger
        if self.is_log:
            self.set_wandb()
            # wandb.watch([self.dqn], log="parameters")

        # Pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.episode_num + 1):
            state = self.env.reset()
            hidden_in = torch.zeros(
                [1, 1, self.learner.gru_cfg.rnn_hidden_size], dtype=torch.float
            ).to(self.learner.device)
            prev_action = torch.zeros(
                1, 1, self.learner.head_cfg.configs.output_size
            ).to(self.learner.device)
            prev_reward = torch.zeros(1, 1, 1).to(self.learner.device)
            self.episode_step = 0
            self.sequence_step = 0
            losses = list()
            done = False
            score = 0

            t_begin = time.time()

            while not done:
                if self.is_render and self.i_episode >= self.render_after:
                    self.env.render()

                action, hidden_out = self.select_action(
                    state, hidden_in, prev_action, prev_reward
                )
                next_state, reward, done, _ = self.step(action, hidden_in)
                self.total_step += 1
                self.episode_step += 1

                if self.episode_step % self.hyper_params.sequence_size == 0:
                    self.sequence_step += 1

                if len(self.memory) >= self.hyper_params.update_starts_from:
                    if self.sequence_step % self.hyper_params.train_freq == 0:
                        for _ in range(self.hyper_params.multiple_update):
                            experience = self.sample_experience()
                            info = self.learner.update_model(experience)
                            loss = info[0:2]
                            indices, new_priorities = info[2:4]
                            losses.append(loss)  # For logging
                            self.memory.update_priorities(indices, new_priorities)

                    # Decrease epsilon
                    self.epsilon = max(
                        self.epsilon
                        - (self.max_epsilon - self.min_epsilon)
                        * self.hyper_params.epsilon_decay,
                        self.min_epsilon,
                    )

                    # Increase priority beta
                    fraction = min(float(self.i_episode) / self.episode_num, 1.0)
                    self.per_beta = self.per_beta + fraction * (1.0 - self.per_beta)

                hidden_in = hidden_out
                state = next_state
                prev_action = common_utils.make_one_hot(
                    torch.as_tensor(action), self.learner.head_cfg.configs.output_size
                )
                prev_reward = torch.as_tensor(reward).to(self.learner.device)
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)

                if self.i_episode % self.save_period == 0:
                    self.learner.save_params(self.i_episode)
                    self.interim_test()

        # Termination
        self.env.close()
        self.learner.save_params(self.i_episode)
        self.interim_test()

    def _test(self, interim_test: bool = False):
        """Common test routine."""

        if interim_test:
            test_num = self.interim_test_num
        else:
            test_num = self.episode_num
        score_list = []
        for i_episode in range(test_num):
            hidden_in = torch.zeros(
                [1, 1, self.learner.gru_cfg.rnn_hidden_size], dtype=torch.float
            ).to(self.learner.device)
            prev_action = torch.zeros(
                1, 1, self.learner.head_cfg.configs.output_size
            ).to(self.learner.device)
            prev_reward = torch.zeros(1, 1, 1).to(self.learner.device)
            state = self.env.reset()
            done = False
            score = 0
            step = 0

            while not done:
                if self.is_render:
                    self.env.render()

                action, hidden_out = self.select_action(
                    state, hidden_in, prev_action, prev_reward
                )
                next_state, reward, done, _ = self.step(action, hidden_in)

                hidden_in = hidden_out
                state = next_state
                prev_action = common_utils.make_one_hot(
                    torch.as_tensor(action), self.learner.head_cfg.configs.output_size
                )
                prev_reward = torch.as_tensor(reward).to(self.learner.device)
                score += reward
                step += 1

            print(
                "[INFO] test %d\tstep: %d\ttotal score: %d" % (i_episode, step, score)
            )
            score_list.append(score)

        if self.is_log:
            wandb.log(
                {
                    "avg test score": round(sum(score_list) / len(score_list), 2),
                    "test total step": self.total_step,
                }
            )

    def test_with_saliency_map(self):
        """Test agent with saliency map."""
        saliency_map_dir = make_saliency_dir(self.load_from.split("/")[-2])
        print(f"Save saliency map in directory : {saliency_map_dir}")
        print("Saving saliency maps...")
        i = 0
        for i_episode in range(self.episode_num):
            hidden_in = torch.zeros(
                [1, 1, self.learner.gru_cfg.rnn_hidden_size], dtype=torch.float
            ).to(self.learner.device)
            prev_action = torch.zeros(
                1, 1, self.learner.head_cfg.configs.output_size
            ).to(self.learner.device)
            prev_reward = torch.zeros(1, 1, 1).to(self.learner.device)
            state = self.env.reset()
            done = False
            score = 0
            step = 0

            key = 0
            print("\nPress Any Key to move to next step... (quit: ESC key)")
            while not done:
                action, hidden_out = self.select_action(
                    state, hidden_in, prev_action, prev_reward
                )
                for param in self.learner.dqn.parameters():
                    param.requires_grad = False
                saliency_map = save_saliency_maps(
                    i,
                    (state, hidden_in, prev_action, prev_reward),
                    action,
                    self.learner.dqn,
                    self.learner.device,
                    saliency_map_dir,
                )
                i += 1
                next_state, reward, done, _ = self.step(action, hidden_in)

                state = np.transpose(state[-1])
                state = cv2.cvtColor(state, cv2.COLOR_GRAY2BGR)
                state = cv2.resize(state, (150, 150), interpolation=cv2.INTER_LINEAR)

                # Get Grad-CAM image
                result_images = None
                saliency_map = np.asarray(saliency_map)
                saliency_map = cv2.resize(
                    saliency_map, (150, 150), interpolation=cv2.INTER_LINEAR
                )
                saliency_map = cv2.cvtColor(saliency_map, cv2.COLOR_RGBA2BGR)
                overlay = cv2.addWeighted(state, 1.0, saliency_map, 0.5, 0)
                result = np.hstack([state, saliency_map, overlay])
                result_images = (
                    result
                    if result_images is None
                    else np.vstack([result_images, result])
                )
                # Show action on result image
                cv2.putText(
                    img=result_images,
                    text=f"action: {action}",
                    org=(50, 50),
                    fontFace=cv2.FONT_HERSHEY_PLAIN,
                    fontScale=1,
                    color=(0, 0, 255),
                    thickness=2,
                )

                cv2.imshow("result", result_images)
                key = cv2.waitKey(0)
                if key == 27 & 0xFF:  # ESC key
                    cv2.destroyAllWindows()
                    break

                state = next_state
                hidden_in = hidden_out
                prev_action = common_utils.make_one_hot(
                    torch.as_tensor(action), self.learner.head_cfg.configs.output_size
                )
                prev_reward = torch.as_tensor(reward).to(self.learner.device)
                score += reward
                step += 1

            print(
                "[INFO] test %d\tstep: %d\ttotal score: %d" % (i_episode, step, score)
            )
            if key == 27 & 0xFF:  # ESC key
                break
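
The recurrent buffers above store fixed-length sequences that overlap by overlap_size. As a rough mental model (an assumption about RecurrentReplayBuffer's internals, not its actual code), an episode can be chopped into such windows like this:

from typing import List, Sequence, Tuple


def split_into_sequences(episode: Sequence, sequence_size: int, overlap_size: int) -> List[Tuple]:
    """Cut an episode into fixed-length windows overlapping by overlap_size.

    Illustrative sketch; the real buffer's padding and burn-in handling
    may differ.
    """
    stride = sequence_size - overlap_size
    assert stride > 0, "overlap_size must be smaller than sequence_size"
    windows = []
    for start in range(0, max(len(episode) - sequence_size, 0) + 1, stride):
        windows.append(tuple(episode[start:start + sequence_size]))
    return windows


# split_into_sequences(range(10), sequence_size=4, overlap_size=2)
# -> [(0, 1, 2, 3), (2, 3, 4, 5), (4, 5, 6, 7), (6, 7, 8, 9)]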
Example 13
class DQfDAgent(DQNAgent):
    """DQN interacting with environment.

    Attribute:
        memory (PrioritizedReplayBuffer): replay memory

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:
            # load demo replay memory
            demos = self._load_demos()

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory
            self.memory = ReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                demo=demos,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory,
                alpha=self.hyper_params.per_alpha,
                epsilon_d=self.hyper_params.per_eps_demo,
            )

        self.learner_cfg.type = "DQfDLearner"
        self.learner = build_learner(self.learner_cfg)

    def _load_demos(self) -> list:
        """Load expert's demonstrations."""
        # load demo replay memory
        with open(self.args.demo_path, "rb") as f:
            demos = pickle.load(f)

        return demos

    def write_log(self, log_value: tuple):
        """Write log about loss and score"""
        i, avg_loss, score, avg_time_cost = log_value
        print(
            "[INFO] episode %d, episode step: %d, total step: %d, total score: %f\n"
            "epsilon: %f, total loss: %f, dq loss: %f, supervised loss: %f\n"
            "avg q values: %f, demo num in minibatch: %d (spent %.6f sec/step)\n"
            % (
                i,
                self.episode_step,
                self.total_step,
                score,
                self.epsilon,
                avg_loss[0],
                avg_loss[1],
                avg_loss[2],
                avg_loss[3],
                avg_loss[4],
                avg_time_cost,
            ))

        if self.args.log:
            wandb.log({
                "score": score,
                "epsilon": self.epsilon,
                "total loss": avg_loss[0],
                "dq loss": avg_loss[1],
                "supervised loss": avg_loss[2],
                "avg q values": avg_loss[3],
                "demo num in minibatch": avg_loss[4],
                "time per each step": avg_time_cost,
            })

    def pretrain(self):
        """Pretraining steps."""
        pretrain_loss = list()
        pretrain_step = self.hyper_params.pretrain_step
        print("[INFO] Pre-Train %d step." % pretrain_step)
        for i_step in range(1, pretrain_step + 1):
            t_begin = time.time()
            experience = self.sample_experience()
            info = self.learner.update_model(experience)
            loss = info[0:5]
            t_end = time.time()
            pretrain_loss.append(loss)  # for logging

            # logging
            if i_step == 1 or i_step % 100 == 0:
                avg_loss = np.vstack(pretrain_loss).mean(axis=0)
                pretrain_loss.clear()
                log_value = (0, avg_loss, 0.0, t_end - t_begin)
                self.write_log(log_value)
        print("[INFO] Pre-Train Complete!\n")

    def train(self):
        """Train the agent."""
        # logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.dqn], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            self.episode_step = 0
            losses = list()
            done = False
            score = 0

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                if len(self.memory) >= self.hyper_params.update_starts_from:
                    if self.total_step % self.hyper_params.train_freq == 0:
                        for _ in range(self.hyper_params.multiple_update):
                            experience = self.sample_experience()
                            info = self.learner.update_model(experience)
                            loss = info[0:5]
                            indices, new_priorities = info[5:7]
                            losses.append(loss)  # for logging
                            self.memory.update_priorities(
                                indices, new_priorities)

                    # decrease epsilon
                    self.epsilon = max(
                        self.epsilon - (self.max_epsilon - self.min_epsilon) *
                        self.hyper_params.epsilon_decay,
                        self.min_epsilon,
                    )

                    # increase priority beta
                    fraction = min(
                        float(self.i_episode) / self.args.episode_num, 1.0)
                    self.per_beta = self.per_beta + fraction * (1.0 -
                                                                self.per_beta)

                state = next_state
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)

            if self.i_episode % self.args.save_period == 0:
                self.learner.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.learner.save_params(self.i_episode)
        self.interim_test()
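
The log output above reports a "supervised loss" alongside the dq (TD) loss; in DQfD (Hester et al., 2018) this is the large-margin classification loss computed on demonstration samples. A minimal PyTorch sketch of that term follows; the margin value and the masking of non-demo samples are assumptions, and DQfDLearner's actual loss may differ.

import torch


def large_margin_loss(q_values: torch.Tensor, demo_actions: torch.Tensor,
                      margin: float = 0.8) -> torch.Tensor:
    """DQfD-style supervised loss: max_a [Q(s,a) + l(a_E, a)] - Q(s, a_E).

    Sketch only. q_values: [batch, n_actions]; demo_actions: [batch] (long).
    """
    margins = torch.full_like(q_values, margin)
    margins.scatter_(1, demo_actions.unsqueeze(1), 0.0)  # zero margin at the expert action
    q_expert = q_values.gather(1, demo_actions.unsqueeze(1)).squeeze(1)
    return ((q_values + margins).max(dim=1).values - q_expert).mean()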
Example 14
class SACfDAgent(SACAgent):
    """SAC agent interacting with environment.

    Attributes:
        memory (PrioritizedReplayBuffer): replay memory
        beta (float): beta parameter for prioritized replay buffer
        use_n_step (bool): whether or not to use n-step returns

    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        self.per_beta = self.hyper_params.per_beta
        self.use_n_step = self.hyper_params.n_step > 1

        if not self.is_test:
            # load demo replay memory
            with open(self.hyper_params.demo_path, "rb") as f:
                demos = pickle.load(f)

            if self.use_n_step:
                demos, demos_n_step = common_utils.get_n_step_info_from_demo(
                    demos, self.hyper_params.n_step, self.hyper_params.gamma)

                # replay memory for multi-steps
                self.memory_n = ReplayBuffer(
                    max_len=self.hyper_params.buffer_size,
                    batch_size=self.hyper_params.batch_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                    demo=demos_n_step,
                )

            # replay memory
            self.memory = ReplayBuffer(self.hyper_params.buffer_size,
                                       self.hyper_params.batch_size,
                                       demo=demos)
            self.memory = PrioritizedBufferWrapper(
                self.memory,
                alpha=self.hyper_params.per_alpha,
                epsilon_d=self.hyper_params.per_eps_demo,
            )

        build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.shape[0],
            is_test=self.is_test,
            load_from=self.load_from,
        )
        self.learner = build_learner(self.learner_cfg, build_args)

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # add n-step transition
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # add a single step transition
        # if transition is not an empty tuple
        if transition:
            self.memory.add(transition)

    def sample_experience(self) -> Tuple[torch.Tensor, ...]:
        experiences_1 = self.memory.sample(self.per_beta)
        experiences_1 = (common_utils.numpy2floattensor(
            experiences_1[:6], self.learner.device) + experiences_1[6:])
        if self.use_n_step:
            indices = experiences_1[-2]
            experiences_n = self.memory_n.sample(indices)
            return (
                experiences_1,
                common_utils.numpy2floattensor(experiences_n,
                                               self.learner.device),
            )

        return experiences_1

    def pretrain(self):
        """Pretraining steps."""
        pretrain_loss = list()
        pretrain_step = self.hyper_params.pretrain_step
        print("[INFO] Pre-Train %d steps." % pretrain_step)
        for i_step in range(1, pretrain_step + 1):
            t_begin = time.time()
            experience = self.sample_experience()
            info = self.learner.update_model(experience)
            loss = info[0:5]
            t_end = time.time()
            pretrain_loss.append(loss)  # for logging

            # logging
            if i_step == 1 or i_step % 100 == 0:
                avg_loss = np.vstack(pretrain_loss).mean(axis=0)
                pretrain_loss.clear()
                log_value = (
                    0,
                    avg_loss,
                    0,
                    self.hyper_params.policy_update_freq,
                    t_end - t_begin,
                )
                self.write_log(log_value)
        print("[INFO] Pre-Train Complete!\n")

    def train(self):
        """Train the agent."""
        # logger
        if self.is_log:
            self.set_wandb()
            # wandb.watch([self.actor, self.vf, self.qf_1, self.qf_2], log="parameters")

        # pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.episode_num + 1):
            state = self.env.reset()
            done = False
            score = 0
            self.episode_step = 0
            loss_episode = list()

            t_begin = time.time()

            while not done:
                if self.is_render and self.i_episode >= self.render_after:
                    self.env.render()

                action = self.select_action(state)
                next_state, reward, done, _ = self.step(action)
                self.total_step += 1
                self.episode_step += 1

                state = next_state
                score += reward

                # training
                if len(self.memory) >= self.hyper_params.batch_size:
                    for _ in range(self.hyper_params.multiple_update):
                        experience = self.sample_experience()
                        info = self.learner.update_model(experience)
                        loss = info[0:5]
                        indices, new_priorities = info[5:7]
                        loss_episode.append(loss)  # for logging
                        self.memory.update_priorities(indices, new_priorities)

                # increase priority beta
                fraction = min(float(self.i_episode) / self.episode_num, 1.0)
                self.per_beta = self.per_beta + fraction * (1.0 -
                                                            self.per_beta)

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            # logging
            if loss_episode:
                avg_loss = np.vstack(loss_episode).mean(axis=0)
                log_value = (
                    self.i_episode,
                    avg_loss,
                    score,
                    self.hyper_params.policy_update_freq,
                    avg_time_cost,
                )
                self.write_log(log_value)

            if self.i_episode % self.save_period == 0:
                self.learner.save_params(self.i_episode)
                self.interim_test()

        # termination
        self.env.close()
        self.learner.save_params(self.i_episode)
        self.interim_test()
Example 15
class R2D1Agent(DQNAgent):
    """R2D1 interacting with environment.

    Attribute:
        memory (RecurrentPrioritizedReplayBuffer): replay memory for recurrent agent
        memory_n (RecurrentReplayBuffer): nstep replay memory for recurrent agent
    """

    # pylint: disable=attribute-defined-outside-init
    def _initialize(self):
        """Initialize non-common things."""
        if not self.args.test:

            self.memory = RecurrentReplayBuffer(
                self.hyper_params.buffer_size,
                self.hyper_params.batch_size,
                self.hyper_params.sequence_size,
                self.hyper_params.overlap_size,
                n_step=self.hyper_params.n_step,
                gamma=self.hyper_params.gamma,
            )
            self.memory = PrioritizedBufferWrapper(
                self.memory, alpha=self.hyper_params.per_alpha)

            # replay memory for multi-steps
            if self.use_n_step:
                self.memory_n = RecurrentReplayBuffer(
                    self.hyper_params.buffer_size,
                    self.hyper_params.batch_size,
                    self.hyper_params.sequence_size,
                    self.hyper_params.overlap_size,
                    n_step=self.hyper_params.n_step,
                    gamma=self.hyper_params.gamma,
                )

        self.learner = build_learner(self.learner_cfg)

    def select_action(
        self,
        state: np.ndarray,
        hidden_state: torch.Tensor,
        prev_action: torch.Tensor,
        prev_reward: np.ndarray,
    ) -> np.ndarray:
        """Select an action from the input space."""
        self.curr_state = state

        # epsilon greedy policy
        state = self._preprocess_state(state)
        with torch.no_grad():
            selected_action, hidden_state = self.learner.dqn(
                state, hidden_state, prev_action, prev_reward)
        selected_action = selected_action.detach().argmax().cpu().numpy()

        if not self.args.test and self.epsilon > np.random.random():
            selected_action = np.array(self.env.action_space.sample())
        return selected_action, hidden_state

    def _add_transition_to_memory(self, transition: Tuple[np.ndarray, ...]):
        """Add 1 step and n step transitions to memory."""
        # Add n-step transition
        if self.use_n_step:
            transition = self.memory_n.add(transition)

        # Add a single step transition
        # If transition is not an empty tuple
        if transition:
            self.memory.add(transition)

    def step(
        self, action: np.ndarray, hidden_state: torch.Tensor
    ) -> Tuple[np.ndarray, np.float64, bool, dict]:
        """Take an action and return the response of the env."""
        next_state, reward, done, info = self.env.step(action)
        if not self.args.test:
            # if the last state is not a terminal state, store done as false
            done_bool = (False if self.episode_step
                         == self.args.max_episode_steps else done)

            transition = (
                self.curr_state,
                action,
                hidden_state.detach(),
                reward,
                next_state,
                done_bool,
            )
            self._add_transition_to_memory(transition)

        return next_state, reward, done, info

    def sample_experience(self) -> Tuple[torch.Tensor, ...]:
        experiences_1 = self.memory.sample(self.per_beta)
        experiences_1 = (numpy2floattensor(experiences_1[:3]) +
                         (experiences_1[3], ) +
                         numpy2floattensor(experiences_1[4:6]) +
                         (experiences_1[6:]))
        if self.use_n_step:
            indices = experiences_1[-2]
            experiences_n = self.memory_n.sample(indices)
            return (
                experiences_1,
                numpy2floattensor(experiences_n[:3]) + (experiences_n[3], ) +
                numpy2floattensor(experiences_n[4:]),
            )

        return experiences_1
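
# A minimal standalone sketch (not the PrioritizedBufferWrapper itself) of what
# `per_beta` controls in `self.memory.sample(self.per_beta)`: transitions are
# drawn with probability proportional to priority**alpha, and each draw gets an
# importance-sampling weight (N * P(i)) ** (-beta), normalized by the maximum,
# to correct the bias that prioritized sampling introduces. Names below are
# illustrative.
import numpy as np

def per_sample(priorities: np.ndarray, batch_size: int, alpha: float, beta: float):
    """Return sampled indices and normalized importance-sampling weights."""
    probs = priorities.astype(np.float64) ** alpha
    probs /= probs.sum()  # P(i) = p_i**alpha / sum_j p_j**alpha
    indices = np.random.choice(len(priorities), size=batch_size, p=probs)
    weights = (len(priorities) * probs[indices]) ** (-beta)
    weights /= weights.max()  # scale so the largest weight is 1
    return indices, weights.astype(np.float32)

# e.g. per_sample(np.array([1.0, 0.5, 2.0, 0.1]), batch_size=2, alpha=0.6, beta=0.4)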

    def train(self):
        """Train the agent."""
        # Logger
        if self.args.log:
            self.set_wandb()
            # wandb.watch([self.dqn], log="parameters")

        # Pre-training if needed
        self.pretrain()

        for self.i_episode in range(1, self.args.episode_num + 1):
            state = self.env.reset()
            hidden_in = torch.zeros(
                [1, 1, self.learner.gru_cfg.rnn_hidden_size],
                dtype=torch.float).to(device)
            prev_action = torch.zeros(
                1, 1, self.learner.head_cfg.configs.output_size).to(device)
            prev_reward = torch.zeros(1, 1, 1).to(device)
            self.episode_step = 0
            self.sequence_step = 0
            losses = list()
            done = False
            score = 0

            t_begin = time.time()

            while not done:
                if self.args.render and self.i_episode >= self.args.render_after:
                    self.env.render()

                action, hidden_out = self.select_action(
                    state, hidden_in, prev_action, prev_reward)
                next_state, reward, done, _ = self.step(action, hidden_in)
                self.total_step += 1
                self.episode_step += 1

                if self.episode_step % self.hyper_params.sequence_size == 0:
                    self.sequence_step += 1

                if len(self.memory) >= self.hyper_params.update_starts_from:
                    if self.sequence_step % self.hyper_params.train_freq == 0:
                        for _ in range(self.hyper_params.multiple_update):
                            experience = self.sample_experience()
                            info = self.learner.update_model(experience)
                            loss = info[0:2]
                            indices, new_priorities = info[2:4]
                            losses.append(loss)  # For logging
                            self.memory.update_priorities(
                                indices, new_priorities)

                    # Decrease epsilon
                    self.epsilon = max(
                        self.epsilon - (self.max_epsilon - self.min_epsilon) *
                        self.hyper_params.epsilon_decay,
                        self.min_epsilon,
                    )

                    # Increase priority beta
                    fraction = min(
                        float(self.i_episode) / self.args.episode_num, 1.0)
                    self.per_beta = self.per_beta + fraction * (1.0 -
                                                                self.per_beta)

                hidden_in = hidden_out
                state = next_state
                prev_action = common_utils.make_one_hot(
                    torch.as_tensor(action),
                    self.learner.head_cfg.configs.output_size)
                prev_reward = torch.as_tensor(reward).to(device)
                score += reward

            t_end = time.time()
            avg_time_cost = (t_end - t_begin) / self.episode_step

            if losses:
                avg_loss = np.vstack(losses).mean(axis=0)
                log_value = (self.i_episode, avg_loss, score, avg_time_cost)
                self.write_log(log_value)

                if self.i_episode % self.args.save_period == 0:
                    self.learner.save_params(self.i_episode)
                    self.interim_test()

        # Termination
        self.env.close()
        self.learner.save_params(self.i_episode)
        self.interim_test()

    def _test(self, interim_test: bool = False):
        """Common test routine."""

        if interim_test:
            test_num = self.args.interim_test_num
        else:
            test_num = self.args.episode_num
        score_list = []
        for i_episode in range(test_num):
            hidden_in = torch.zeros(
                [1, 1, self.learner.gru_cfg.rnn_hidden_size],
                dtype=torch.float).to(device)
            prev_action = torch.zeros(
                1, 1, self.learner.head_cfg.configs.output_size).to(device)
            prev_reward = torch.zeros(1, 1, 1).to(device)
            state = self.env.reset()
            done = False
            score = 0
            step = 0

            while not done:
                if self.args.render:
                    self.env.render()

                action, hidden_out = self.select_action(
                    state, hidden_in, prev_action, prev_reward)
                next_state, reward, done, _ = self.step(action, hidden_in)

                hidden_in = hidden_out
                state = next_state
                prev_action = common_utils.make_one_hot(
                    torch.as_tensor(action),
                    self.learner.head_cfg.configs.output_size)
                prev_reward = torch.as_tensor(reward).to(device)
                score += reward
                step += 1

            print("[INFO] test %d\tstep: %d\ttotal score: %d" %
                  (i_episode, step, score))
            score_list.append(score)

        if self.args.log:
            wandb.log({
                "avg test score": round(sum(score_list) / len(score_list), 2),
                "test total step": self.total_step,
            })
Example no. 16
0
    def _spawn(self):
        """Intialize distributed worker, learner and centralized replay buffer."""
        replay_buffer = ReplayBuffer(
            self.hyper_params.buffer_size,
            self.hyper_params.batch_size,
        )
        per_buffer = PrioritizedBufferWrapper(
            replay_buffer, alpha=self.hyper_params.per_alpha)
        self.global_buffer = ApeXBufferWrapper.remote(per_buffer,
                                                      self.hyper_params,
                                                      self.comm_cfg)

        # Build learner
        learner_build_args = dict(
            hyper_params=self.hyper_params,
            log_cfg=self.log_cfg,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            is_test=self.is_test,
            load_from=self.load_from,
        )
        learner = build_learner(self.learner_cfg, learner_build_args)
        self.learner = ApeXLearnerWrapper.remote(learner, self.comm_cfg)

        # Build workers
        state_dict = learner.get_state_dict()
        worker_build_args = dict(
            hyper_params=self.hyper_params,
            backbone=self.learner_cfg.backbone,
            head=self.learner_cfg.head,
            loss_type=self.learner_cfg.loss_type,
            state_dict=state_dict,
            env_name=self.env_info.name,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            is_atari=self.env_info.is_atari,
            max_episode_steps=self.max_episode_steps,
        )
        self.workers = []
        self.num_workers = self.hyper_params.num_workers
        for rank in range(self.num_workers):
            worker_build_args["rank"] = rank
            worker = build_worker(self.worker_cfg,
                                  build_args=worker_build_args)
            apex_worker = ApeXWorkerWrapper.remote(worker, self.comm_cfg)
            self.workers.append(apex_worker)

        # Build logger
        logger_build_args = dict(
            log_cfg=self.log_cfg,
            comm_cfg=self.comm_cfg,
            backbone=self.learner_cfg.backbone,
            head=self.learner_cfg.head,
            env_name=self.env_info.name,
            is_atari=self.env_info.is_atari,
            state_size=self.env_info.observation_space.shape,
            output_size=self.env_info.action_space.n,
            max_update_step=self.hyper_params.max_update_step,
            episode_num=self.episode_num,
            max_episode_steps=self.max_episode_steps,
            is_log=self.is_log,
            is_render=self.is_render,
            interim_test_num=self.interim_test_num,
        )

        self.logger = build_logger(self.logger_cfg, logger_build_args)

        self.processes = self.workers + [
            self.learner, self.global_buffer, self.logger
        ]
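
# A minimal standalone sketch (not the project's ApeX wrapper classes) of the
# actor layout that `_spawn` builds with Ray: several rollout workers push
# experience to one central buffer actor, which a learner-side consumer can
# then sample from. Only core Ray primitives are used; every class, method and
# number here is an illustrative assumption, not the repository's API.
import random
import ray

@ray.remote
class CentralBuffer:
    def __init__(self):
        self.storage = []

    def add(self, batch):
        self.storage.extend(batch)

    def sample(self, k):
        return random.sample(self.storage, min(k, len(self.storage)))

@ray.remote
class Worker:
    def __init__(self, rank, buffer):
        self.rank, self.buffer = rank, buffer

    def collect(self, n):
        # stand-in for real environment interaction
        batch = [(self.rank, step) for step in range(n)]
        ray.get(self.buffer.add.remote(batch))
        return len(batch)

if __name__ == "__main__":
    ray.init()
    buffer = CentralBuffer.remote()
    workers = [Worker.remote(rank, buffer) for rank in range(4)]
    counts = ray.get([w.collect.remote(32) for w in workers])
    print("collected:", sum(counts), "buffered sample:", ray.get(buffer.sample.remote(2)))
    ray.shutdown()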