Example #1

import datetime

import numpy as np
# SummaryWriter is assumed to come from tensorboardX here
# (torch.utils.tensorboard provides a compatible class as well).
from tensorboardX import SummaryWriter

# Actor, Critic and ReplayBuffer are this project's own modules; the module
# paths below are placeholders and may differ in the original repository.
from model import Actor, Critic
from replay_buffer import ReplayBuffer


class Agent:
    def __init__(self,
                 state_size,
                 action_size,
                 param={},
                 level_name='general'):
        self.seed = 714
        np.random.seed(seed=self.seed)
        self.state_size = state_size
        self.action_size = action_size
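        # placeholder advantage / old-probability arrays fed to the actor's
        # extra inputs when act() calls predict; only the state matters there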
        self.dummy_adv = np.zeros((1, 1))
        self.dummy_actions_prob = np.zeros((1, action_size))
        self.actor = Actor(state_size, action_size)
        self.critic = Critic(state_size, action_size)
        self.level_name = level_name
        timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H%M")
        self.writer = SummaryWriter('logs/%s/%s' %
                                    (self.level_name, timestamp))
        self.best_weight_fn = 'ppo_best_%s_' + level_name.lower().replace(
            '.', '_') + '.h5'
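        # rollout memory: [states, actions taken (one-hot), action probabilities, rewards]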
        self.memory = [[], [], [], []]
        self.update_count = 0
        self.cur_ind = 0
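        # discount factor used by compute_decay_reward()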
        self.GAMMA = 0.99

        self.EXPERIENCE_REPLAY = param.get('EXPERIENCE_REPLAY', False)
        if self.EXPERIENCE_REPLAY:
            self.BUFFER_SIZE = param['BUFFER_SIZE']
            self.BATCH_SIZE = param['BATCH_SIZE']
            self.buffer = ReplayBuffer(self.action_size, self.BUFFER_SIZE,
                                       self.BATCH_SIZE, self.seed)

    def get_memory_size(self):
        return len(self.memory[0])

    def get_batch(self, batch_size):
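        # take the next sequential slice of the rollout; the last slice may be
        # shorter than batch_size, after which the memory is cleared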
        start, end = self.cur_ind, self.cur_ind + batch_size
        self.cur_ind += batch_size
        if end >= self.get_memory_size():
            end = self.get_memory_size()
            self.cur_ind = 0
            batch_size = end - start
        state = np.array(self.memory[0][start:end])
        action_took = np.array(self.memory[1][start:end])
        old_actions_prob = np.array(self.memory[2][start:end])
        reward = np.array(self.memory[3][start:end]).reshape(batch_size, 1)

        if self.cur_ind == 0:
            self.reset_memory()

        return state, action_took, old_actions_prob, reward, batch_size

    def reset_memory(self):
        del self.memory
        self.memory = [[], [], [], []]

    def step(self, state, action_took, actions_prob, reward):
        self.memory[0].append(state)
        self.memory[1].append(action_took)
        self.memory[2].append(actions_prob)
        self.memory[3].append(reward)

    def act(self, state, test=False):
        actions_prob = None
        if test:
            actions_prob = self.actor.model.predict(np.array([state]))
        else:
            actions_prob = self.actor.model.predict(
                [np.array([state]), self.dummy_adv, self.dummy_actions_prob])
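        # sample an action from the predicted probability distribution;
        # nan_to_num guards against NaN entries coming out of the network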
        action = np.random.choice(self.action_size,
                                  p=np.nan_to_num(actions_prob[0]))
        action_took = np.zeros(self.action_size)
        action_took[action] = 1
        return action, action_took, actions_prob[0]

    def compute_decay_reward(self):
        memory_size = self.get_memory_size()

        # states stay a plain list; actions, probabilities and rewards become arrays
        self.memory[1] = np.array(self.memory[1])
        self.memory[2] = np.array(self.memory[2])
        self.memory[3] = np.array(self.memory[3])

        # replace each reward with its discounted return, working backwards:
        # G_t = r_t + GAMMA * G_{t+1}
        for t in range(memory_size - 2, -1, -1):
            self.memory[3][t] = (self.memory[3][t] +
                                 self.memory[3][t + 1] * self.GAMMA)

        if self.EXPERIENCE_REPLAY:
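            # push the whole processed rollout into the replay buffer as well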
            self.buffer.adds(self.memory[0], self.memory[1], self.memory[2],
                             self.memory[3])

    def learn(self, batch_size, i_epoch):
        """
            batch: state, action, actions_prob, reward
        """

        while self.get_memory_size() != 0:
            state, action_took, old_actions_prob, reward, batch_size = self.get_batch(
                batch_size)
            if batch_size == 0:
                break
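            # advantage = discounted return minus the critic's value estimate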
            advantage = self.critic.model.predict(state)
            advantage = reward - advantage
            actor_loss = self.actor.model.fit(
                [state, advantage, old_actions_prob], [action_took],
                batch_size=batch_size,
                shuffle=True,
                epochs=i_epoch,
                verbose=False)
            critic_loss = self.critic.model.fit([state], [reward],
                                                batch_size=batch_size,
                                                shuffle=True,
                                                epochs=i_epoch,
                                                verbose=False)
            self.writer.add_scalar('Actor loss',
                                   actor_loss.history['loss'][-1],
                                   self.update_count)
            self.writer.add_scalar('Critic loss',
                                   critic_loss.history['loss'][-1],
                                   self.update_count)

            self.update_count += 1

    def learn_from_buffer(self, batch_size, i_epoch):
        """
            batch: state, action, actions_prob, reward
        """

        if self.EXPERIENCE_REPLAY and len(self.buffer) > batch_size:
            experiences = self.buffer.sample()
            state, action_took, reward, old_actions_prob = experiences
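            # note: the unpacking order must match what ReplayBuffer.sample() returns
            # advantage = discounted return minus the critic's value estimate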
            advantage = self.critic.model.predict(state)
            advantage = reward - advantage
            actor_loss = self.actor.model.fit(
                [state, advantage, old_actions_prob], [action_took],
                batch_size=batch_size,
                shuffle=True,
                epochs=i_epoch,
                verbose=False)
            critic_loss = self.critic.model.fit([state], [reward],
                                                batch_size=batch_size,
                                                shuffle=True,
                                                epochs=i_epoch,
                                                verbose=False)
            self.writer.add_scalar('Actor loss',
                                   actor_loss.history['loss'][-1],
                                   self.update_count)
            self.writer.add_scalar('Critic loss',
                                   critic_loss.history['loss'][-1],
                                   self.update_count)

            self.update_count += 1

    def save_model(self, name=None):
        if name is None:
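            # fill the '%s' placeholder in the default weight-file name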
            actor_name = self.best_weight_fn % 'actor'
            critic_name = self.best_weight_fn % 'critic'
        else:
            actor_name = name + 'actor' + '.h5'
            critic_name = name + 'critic' + '.h5'

        self.actor.save_model(actor_name)
        self.critic.save_model(critic_name)

    def load_model(self, actor_model_fn, critic_model_fn):
        self.actor.load_model(actor_model_fn)
        self.critic.load_model(critic_model_fn)
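

# --- Minimal usage sketch (not part of the original example) -----------------
# Assumes the classic Gym API (env.reset() returns a state, env.step() returns
# (state, reward, done, info)) and that the project's Actor/Critic networks
# accept the chosen state/action sizes. The environment name, episode count
# and batch size below are illustrative placeholders.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n)

    for episode in range(10):
        state = env.reset()
        done = False
        while not done:
            action, action_took, actions_prob = agent.act(state)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action_took, actions_prob, reward)
            state = next_state
        # convert raw rewards to discounted returns, then update both networks
        agent.compute_decay_reward()
        agent.learn(batch_size=64, i_epoch=1)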