Example #1
import numpy as np
import torch

# Project-local helpers assumed to be importable from elsewhere in the repository:
# Memory (rollout storage), ActorCritic (the network) and load_checkpoint.

class ActorCriticAgent:
    """ Advantage Actor Critic agent """
    def __init__(self, num_actions, checkpoint=None):
        self.network, self.trainable_parameters = self.init_network(
            num_actions)
        self.optimizer = torch.optim.Adam(self.trainable_parameters, lr=1e-4)
        self.memory = Memory()
        if checkpoint is not None:
            load_checkpoint(self.network, self.optimizer, checkpoint)

    def init_network(self, num_actions):
        """Build the actor-critic network and gather its trainable parameters."""
        network = {'actor_critic': ActorCritic(num_actions)}
        trainable_parameters = list(network['actor_critic'].parameters())
        return network, trainable_parameters

    def play(self,
             environment,
             max_games=1,
             max_steps=500,
             train=False,
             verbose=False,
             recorder=None):
        """Run the agent in fixed-length rollouts, optionally training after each one and recording frames."""
        n_steps = 0
        n_games = 0
        current_game_infos = {
            'game': n_games + 1,
            'reward': 0,
            'game_duration': 0
        }
        observation = environment.reset()
        if recorder is not None:
            recorder.reset()
            recorder.record(environment)

        while (n_steps < max_steps) and (n_games < max_games):

            self.init_rollout(observation)
            for rollout_step in range(20):  # fixed-length 20-step rollout between updates

                value, log_policy, action = self.network['actor_critic'](
                    observation)
                self.memory.append({
                    'value': value,
                    'log_policy': log_policy,
                    'action': action
                })

                observation, extrinsic_reward, is_game_over, infos = environment.step(
                    action.numpy()[0])
                if recorder is not None:
                    recorder.record(environment)

                reward = self.get_reward(observation, extrinsic_reward)
                self.memory.append({'reward': reward})

                current_game_infos['reward'] += reward
                current_game_infos['game_duration'] += 1
                n_steps += 1

                if is_game_over:
                    n_games += 1
                    print(current_game_infos)
                    current_game_infos = {
                        'game': n_games + 1,
                        'reward': 0,
                        'game_duration': 0
                    }
                    observation = environment.reset()
                    break

            self.end_rollout(observation, is_game_over)
            if verbose:
                print(current_game_infos)

            if train:
                loss = self.compute_loss()
                self.backpropagate(loss)

        if recorder is not None:
            recorder.stop()

    def init_rollout(self, observation):
        """Clear the rollout memory and detach the network's internal state from the previous graph."""
        self.memory.reset()
        self.network['actor_critic'].detach_internal_state()

    def end_rollout(self, observation, is_game_over):
        """Append the bootstrap value for the last observation (zero if the game just ended)."""
        if is_game_over:
            next_value = torch.Tensor([[0]])
            self.network['actor_critic'].reset_internal_state()
        else:
            next_value = self.network['actor_critic'](observation)[0].detach()
        self.memory.append({'value': next_value})

    def get_reward(self, observation, extrinsic_reward):
        """Clip the extrinsic reward to [-1, 1]."""
        return np.clip(extrinsic_reward, -1, 1)

    def compute_loss(self):
        """Compute the actor-critic loss over the stored rollout."""
        loss = self.network['actor_critic'].loss(self.memory)
        return loss

    def backpropagate(self, loss, max_gradient_norm=40):
        """Take an optimizer step after clipping the global gradient norm."""
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.trainable_parameters,
                                       max_gradient_norm)
        self.optimizer.step()
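
For reference, a minimal driver for the agent above might look like the sketch below. It assumes a Gym-style environment whose API matches the calls made in play() (reset() returning an observation, step() returning a four-tuple) and whose observations are already in the format the project's ActorCritic network expects; the environment name is purely illustrative.

import gym

environment = gym.make('PongNoFrameskip-v4')  # illustrative; any env with a matching API works
agent = ActorCriticAgent(num_actions=environment.action_space.n)

# Train with gradient updates, then evaluate a few games without them.
agent.play(environment, max_games=100, max_steps=1_000_000, train=True, verbose=True)
agent.play(environment, max_games=5, max_steps=10_000, train=False, verbose=True)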

Example #2

import time
from collections import deque

import numpy as np
import rospy


    def run(self):
        # Method excerpt from a trainer class (not shown) that provides self.env,
        # self.agent, the self.img_* scaling values, self.n_frames,
        # self.sample_batch_size, self.highest_reward and self.format_time().
        # Memory and preprocess are project-local helpers assumed to exist.
        total_time = time.time()
        steps = 0

        try:
            save_interval = 0

            memory = Memory()

            do_training = True

            while do_training:
                episode_time = time.time()

                self.env.initial_position = {
                    'p_x': np.random.uniform(1, 4),
                    'p_y': 3.7,
                    'p_z': 0.05,
                    'o_x': 0,
                    'o_y': 0.0,
                    'o_z': np.random.uniform(0.4, 1),
                    'o_w': 0.855
                }
                state = self.env.reset()
                state = preprocess(state, self.img_y_offset, self.img_x_scale,
                                   self.img_y_scale)
                #                 state = np.expand_dims(state, axis=0)

                done = False
                cumulated_reward = 0

                # Keep the last n_frames observations; seed both stacks with copies of the first frame.
                stacked_states = deque(maxlen=self.n_frames)
                stacked_next_states = deque(maxlen=self.n_frames)
                for i in range(self.n_frames):
                    stacked_states.append(state)
                    stacked_next_states.append(state)

                episode_steps = 0
                while not done:
                    steps += 1
                    episode_steps += 1

                    action = self.agent.act(
                        np.expand_dims(np.stack(stacked_states, axis=2),
                                       axis=0))

                    next_state, reward, done, _ = self.env.step(action)
                    next_state = preprocess(next_state, self.img_y_offset,
                                            self.img_x_scale, self.img_y_scale)
                    stacked_next_states.append(next_state)

                    memory.append(action, np.stack(stacked_states, axis=2),
                                  np.stack(stacked_next_states, axis=2),
                                  reward, done)
                    stacked_states.append(next_state)

                    cumulated_reward += reward

                    # Every sample_batch_size steps, run a replay update on the collected transitions.
                    save_interval += 1
                    if save_interval >= self.sample_batch_size:
                        save_interval = 0
                        replay_time = time.time()
                        self.agent.replay(memory)
                        rospy.loginfo("Replay time {}".format(time.time() -
                                                              replay_time))
                        if steps >= self.sample_batch_size * 200:
                            # Stop after a fixed budget of environment steps.
                            do_training = False
                        memory = Memory()  # start a fresh buffer after each replay

                if self.highest_reward < cumulated_reward:
                    self.highest_reward = cumulated_reward

                rospy.loginfo("total episode_steps {}, reward {}/{}".format(
                    episode_steps, cumulated_reward, self.highest_reward))
                rospy.loginfo("Episode time {}, total {}".format(
                    self.format_time(episode_time),
                    self.format_time(total_time)))
                rospy.loginfo("exploration_rate {}".format(
                    self.agent.exploration_rate))

        finally:
            self.env.close()
            rospy.loginfo("Total time: {}".format(
                self.format_time(total_time)))
            rospy.loginfo("Total steps: {}".format(steps))
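
The run() loop above depends on a project-local Memory buffer and an agent exposing act() and replay(), neither of which is shown. A minimal Memory sketch that is consistent with the calls visible above (append(action, state, next_state, reward, done), then handing the buffer to agent.replay) might look like this; the sample() and iteration helpers are assumptions for illustration, not the project's actual implementation.

import random

class Memory:
    """Minimal transition buffer matching the append(...) signature used in run()."""

    def __init__(self):
        self.transitions = []

    def append(self, action, state, next_state, reward, done):
        # Store one transition exactly as run() provides it.
        self.transitions.append((action, state, next_state, reward, done))

    def sample(self, batch_size):
        # Hypothetical helper: uniform sampling for a replay-style update.
        return random.sample(self.transitions, min(batch_size, len(self.transitions)))

    def __len__(self):
        return len(self.transitions)

    def __iter__(self):
        return iter(self.transitions)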