class DDPGController:
    """
    Deep learning agent based on Deep Deterministic Policy Gradient (DDPG), as described in https://arxiv.org/pdf/1509.02971.pdf
    
    """
    def __init__(self, env, brain_name, config):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'batch_size', int, size of the batches sampled to train the model on each update
        - 'update_every', int, update frequency, in number of steps
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

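        Example
        -------
        A minimal usage sketch; the values below are illustrative assumptions,
        and Config stands in for whatever wrapper exposes attribute access and
        as_dict():

            config = Config({'num_episodes': 300, 'gamma': 0.99, 'tau': 1e-3,
                             'max_memory': int(1e6), 'batch_size': 128,
                             'update_every': 20, 'mlp_layers': (400, 300),
                             'learning_rate': 1e-4, 'state_size': 33,
                             'action_size': 4, 'num_agents': 20})
            controller = DDPGController(env, brain_name, config)
            scores, surrogates, critic_losses = controller.solve()
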
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.trained_policy = Policy(config, self.state_size, self.action_size)
        self.target_policy = Policy(config, self.state_size, self.action_size)
        self.trained_critic = Critic(config, self.state_size, self.action_size)
        self.target_critic = Critic(config, self.state_size, self.action_size)
        # these target networks are never trained directly; they only receive soft updates
        self.target_policy.eval()
        self.target_critic.eval()
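        # the replay buffer stores (state, action, next_state, reward, done)
        # tuples, one row per agent per environment step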
        self.memory = AgentMemory(((self.num_agents, self.state_size),
                                   (self.num_agents, self.action_size),
                                   (self.num_agents, self.state_size),
                                   (self.num_agents, ), (self.num_agents, )),
                                  int(self.max_memory))
        self.scores = []
        self.critic_losses = []
        self.surrogates = []

        self.critic_optimizer = optim.Adam(self.trained_critic.parameters(),
                                           lr=config.learning_rate)
        self.policy_optimizer = optim.Adam(self.trained_policy.parameters(),
                                           lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            critic_losses = []
            while True:
                action = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add((state, action, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if self.memory.size >= self.batch_size and not step % self.update_every:
                    surrogate_buffer, critic_loss = self.train()
                    surrogates.append(surrogate_buffer)
                    critic_losses.append(critic_loss)
                step += 1
                if np.any(done):
                    break

            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))
            self.critic_losses.append(np.mean(critic_losses))

            self.print_status(i_episode)

        return self.scores, self.surrogates, self.critic_losses

    def act(self, states):
        """
        Based on states, returns the on-policy actions
        
        Parameter
        ---------
        states - float array shape=(num_agents, state_size)
        
        Return
        ---------
        Float array shape=(num_agents, action_size), chosen action

        """
        states = torch.from_numpy(states).float().to(device)
        self.trained_policy.eval()
        with torch.no_grad():
            actions = self.trained_policy(states)
        # TODO: add exploration noise
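        # A possible sketch (an assumption, not part of this implementation):
        # additive Gaussian noise clipped to the action range, e.g.
        #     noise = np.random.normal(0, 0.1, size=actions.shape)
        #     return np.clip(actions.cpu().data.numpy() + noise, -1, 1)
        # The DDPG paper instead uses an Ornstein-Uhlenbeck noise process.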
        return actions.cpu().data.numpy()

    def train(self):
        """
        Training routine to update the policy and critic from one batch sampled from the replay buffer

        Return
        ---------
        Tuple (surrogate, critic loss), as numpy values for monitoring

        """
        states, actions, next_states, rewards, dones = self.memory.sample(
            self.batch_size)

        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        rewards = torch.from_numpy(rewards).float().to(device)
        dones = torch.from_numpy(dones).float().to(device)

        # critic update
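        # Bellman target: y = r + gamma * Q_target(s', mu_target(s')) * (1 - done);
        # the critic is trained to minimize the mean squared TD error (y - Q(s, a))^2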
        next_actions = self.target_policy(next_states)
        self.trained_critic.train()
        self.critic_optimizer.zero_grad()
        done_mask = 1 - dones
        target_states_values = rewards + self.gamma * \
            self.target_critic(next_states, next_actions) * done_mask
        predicted_states_values = self.trained_critic(states, actions)
        critic_loss = torch.mean(
            (target_states_values - predicted_states_values)**2)
        critic_loss.backward()
        self.critic_optimizer.step()

        # policy update
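        # the policy is updated by gradient ascent on the critic's value of its
        # own actions, i.e. by minimizing the surrogate -mean(Q(s, mu(s)))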
        self.trained_policy.train()
        self.policy_optimizer.zero_grad()
        action_values = self.trained_critic(states,
                                            self.trained_policy(states))
        surrogate = -torch.mean(action_values)
        surrogate.backward()
        self.policy_optimizer.step()

        self.target_network_update(self.trained_critic, self.target_critic)
        self.target_network_update(self.trained_policy, self.target_policy)

        return surrogate.cpu().data.numpy(), critic_loss.cpu().data.numpy()

    def target_network_update(self, trained_model, target_model):
        """
        Performs a soft update with rate tau from the trained_model to the target_model.

        """
        target_model_weights = target_model.get_weights()
        train_model_weights = trained_model.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        target_model.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int

        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Surrogate: %.5f | Critic loss: %.5f  "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1], self.critic_losses[-1]),
            end="")
        sys.stdout.flush()


class PPOController:
    """
    Deep learning agent based on Proximal Policy Optimization, following https://arxiv.org/pdf/1506.02438.pdf

    """
    def __init__(self, env, brain_name, config, policy=None, critic=None):
        """
        Constructor method to create the controller

        Parameters
        ----------
        env - Unity environment for the agent to solve
        brain_name, string, brain name used in conjunction with the environment
        config - Dictionary containing the following keys:
        - 'num_episodes', int, number of episodes to run the agent for
        - 'epsilon_start', float, initial value for epsilon used in the PPO algorithm to clip the surrogate
        - 'epsilon_decay', float, rate of decay for epsilon, applied after every episode
        - 'gamma', float, discount rate for future rewards
        - 'tau', float, rate for the soft update of the target network
        - 'max_memory', int, size of the replay buffer in number of samples
        - 'update_every', int, update frequency, in number of steps
        - 'train_iterations', int, number of training passes over a data batch
        - 'mlp_layers', int tuple, shape of the multilayer perceptron model
        - 'learning_rate', float, learning rate for the training of the model
        - 'std', float, standard deviation used for the Normal distribution of the policy
        - 'state_size', int
        - 'action_size', int
        - 'num_agents', int, number of agents running in parallel in the environment

        policy - optional, used to pass a mock policy for testing purposes
        critic - optional, used to pass a mock critic for testing purposes

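        Example
        -------
        A minimal usage sketch; the values below are illustrative assumptions:

            config = Config({'num_episodes': 300, 'epsilon_start': 0.1,
                             'epsilon_decay': 0.999, 'gamma': 0.99, 'tau': 1e-3,
                             'max_memory': int(1e6), 'update_every': 1000,
                             'train_iterations': 10, 'mlp_layers': (64, 64),
                             'learning_rate': 3e-4, 'std': 0.1, 'state_size': 33,
                             'action_size': 4, 'num_agents': 20})
            controller = PPOController(env, brain_name, config)
            scores, surrogates = controller.solve()
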
        """
        self.env = env
        self.brain_name = brain_name
        self.__dict__.update(config.as_dict())
        self.policy = Policy(config, self.state_size,
                             self.action_size) if policy is None else policy
        self.trained_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic = Critic(
            config, self.state_size) if critic is None else critic
        self.target_critic.eval()
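        # the rollout buffer stores (state, action, log_probability, next_state,
        # reward, done) tuples, one row per agent per environment step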
        self.memory = AgentMemory(
            ((self.num_agents, self.state_size),
             (self.num_agents, self.action_size), (self.num_agents, ),
             (self.num_agents, self.state_size), (self.num_agents, ),
             (self.num_agents, )), int(self.max_memory))
        self.epsilon = config.epsilon_start
        self.scores = []
        self.surrogates = []

        self.optimizer = optim.Adam([{
            'params': self.policy.parameters()
        }, {
            'params': self.trained_critic.parameters()
        }],
                                    lr=config.learning_rate)

    def solve(self):
        """
        Main method to launch the environment loop

        """
        step = 1

        for i_episode in range(1, self.num_episodes + 1):
            env_info = self.env.reset(train_mode=True)[self.brain_name]
            state = env_info.vector_observations
            rewards = []
            surrogates = []
            while True:
                action, log_probability = self.act(state)
                env_info = self.env.step(action)[self.brain_name]
                next_state = env_info.vector_observations
                reward = env_info.rewards
                done = env_info.local_done
                self.memory.add(
                    (state, action, log_probability, next_state, reward, done))
                state = next_state
                rewards.append(reward)
                if not step % self.update_every:
                    surrogate_buffer = self.train_loop()
                    surrogates.append(surrogate_buffer)
                step += 1
                if np.any(done):
                    break

            self.scores.append(np.mean(np.sum(rewards, axis=0)))
            self.surrogates.append(np.mean(surrogates))

            self.epsilon *= self.epsilon_decay
            self.print_status(i_episode)

        return self.scores, self.surrogates

    def act(self, states):
        """
        Based on states, returns the on-policy actions
        
        Parameter
        ---------
        states - float array shape=(num_agents, state_size)
        
        Return
        ---------
        Tuple of float arrays: chosen actions shape=(num_agents, action_size) and their log-probabilities shape=(num_agents, )

        """
        states = torch.from_numpy(states).float().to(device)
        self.policy.eval()
        actions, log_probabilities = self.policy.next_actions(states)
        return actions.cpu().data.numpy(), log_probabilities.cpu().data.numpy()

    def train_loop(self):
        """
        Training routine to update the policy and critic on the latest trajectory segment

        Return
        ---------
        List of surrogate values, one per training iteration

        """
        surrogate_buffer = []
        states, actions, old_log_probabilities, next_states, rewards, dones = self.memory.get_latest(
            self.update_every)

        future_rewards = self.compute_discounted_future_rewards(rewards)

        old_log_probabilities = torch.from_numpy(
            old_log_probabilities).float().to(device)
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)
        future_rewards = torch.from_numpy(future_rewards).float().to(device)
        dones = torch.from_numpy(dones).bool().to(device)
        self.policy.train()
        self.trained_critic.train()
        for _ in range(self.train_iterations):
            surrogate = self.compute_surrogate(old_log_probabilities, states,
                                               actions, next_states,
                                               future_rewards, dones)
            surrogate_buffer.append(surrogate.cpu().data.numpy())
            self.optimizer.zero_grad()
            surrogate.backward()
            self.optimizer.step()
            self.target_network_update()
        return surrogate_buffer

    def compute_surrogate(self, old_log_probabilities, states, actions,
                          next_states, future_rewards, dones):
        """
        Compute the surrogate, i.e. the function optimized at training time

        Parameters
        ----------
        - old_log_probabilities, float Tensor shape=(batch_size, num_agents), log-probabilities of the performed actions under the policy that generated them
        - states, float Tensor shape=(batch_size, num_agents, state_size)
        - actions, float Tensor shape=(batch_size, num_agents, action_size)
        - next_states, float Tensor shape=(batch_size, num_agents, state_size)
        - future_rewards, float Tensor shape=(batch_size, num_agents), discounted sum of future rewards over the length of the trajectory
        - dones, bool Tensor shape=(batch_size, num_agents)

        Return
        ---------
        Surrogate, float Tensor

        """
        new_log_probabilities, entropy = self.policy.get_log_probabilities_and_entropy(
            states, actions)
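        # importance sampling ratio r_t = pi_new(a_t|s_t) / pi_old(a_t|s_t),
        # computed in log space for numerical stability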
        ratio = torch.exp(new_log_probabilities - old_log_probabilities)

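        # bootstrap the value of the trajectory tail from the target critic's
        # estimate of the last next_state, unless an episode boundary was reached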
        with torch.no_grad():
            states_values = self.target_critic(states)
            next_states_values = self.target_critic(next_states[-1, :])
        if torch.any(dones):
            final_states_values = 0
        else:
            final_states_values = next_states_values.expand(
                states_values.shape)

        future_rewards = self.normalize(future_rewards)

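        # each state s_t in the batch is (T - t) steps away from the bootstrapped
        # final state, hence the gamma ** (T - t) discount applied to it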
        discount = self.gamma**torch.arange(len(states_values),
                                            0,
                                            -1,
                                            dtype=torch.float).unsqueeze(1)
        target_states_values = future_rewards + final_states_values * discount
        advantages = target_states_values - states_values

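        # PPO clipped objective: L_clip = E[min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)];
        # the returned loss is -L_clip plus a 0.5-weighted critic MSE term and a
        # -0.01-weighted entropy bonus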
        clip = torch.clamp(ratio, 1 - self.epsilon, 1 + self.epsilon)
        clipped_surrogate = torch.min(ratio * advantages, clip * advantages)

        return -1 * torch.mean(
            clipped_surrogate) + 0.5 * self.trained_critic.mse(
                states_values, target_states_values) - 0.01 * entropy.mean()

    def normalize(self, a):
        """
        Normalize a torch Tensor along its last dimension (zero mean, unit standard deviation); rows with zero deviation are set to 0

        Parameters
        ----------
        - a, float Tensor to normalize

        Return
        ---------
        Normalized float Tensor with the same shape as a

        """
        mean = torch.mean(a, -1)
        std = torch.std(a, -1)
        b = a
        mask = std != 0
        b[mask] = (a[mask] - mean[mask].unsqueeze(1)) / std[mask].unsqueeze(1)
        # if the deviation is null set the normalized reward to 0
        mask = std == 0
        b[mask] = 0
        return b

    def compute_discounted_future_rewards(self, rewards):
        """
        Compute the discounted sum of future reward over the trajectory

        Parameters
        ----------
        - rewards, float array shape=(batch_size, num_agents)

        Return
        ----------
        Discounted future rewards, float array shape=(batch_size, num_agents)

        """
        # This is a bit involved, so here is a worked example with gamma = 0.5 and
        # rewards = [[1, 0],
        #            [1, 1]]
        main_dim = len(rewards)
        # discounts = [1, 0.5]
        discounts = (self.gamma**np.arange(main_dim))
        # discounts = [[1, 0.5],
        #              [1, 0.5]]
        discounts = np.tile(discounts, main_dim).reshape(main_dim, main_dim)
        # indexes = [[0, 1],
        #            [1, 2]]
        indexes = np.tile(np.arange(main_dim), main_dim).reshape(
            main_dim, main_dim) + np.arange(main_dim)[:, np.newaxis]
        # indexes = [[0, 1],
        #            [1, 0]]
        indexes = np.mod(indexes, main_dim)
        # discounts = [[1, 0.5],
        #              [0, 1]]
        discounts = np.triu(discounts[range(main_dim), indexes])
        # discounted future rewards = [[1.5, 0.5],
        #                              [1, 1]]
        return np.dot(discounts, rewards)

    def target_network_update(self):
        """
        Performs a soft update with rate tau from the trained critic to the target critic.

        """
        target_model_weights = self.target_critic.get_weights()
        train_model_weights = self.trained_critic.get_weights()
        new_weights = []
        for w1, w2 in zip(target_model_weights, train_model_weights):
            new_weights.append(w1 * (1 - self.tau) + w2 * self.tau)
        self.target_critic.set_weights(new_weights)

    def print_status(self, i_episode):
        """
        Print the latest status of the agent

        Parameter
        ---------
        i_episode, int

        """
        print(
            "\rEpisode %d/%d | Average Score: %.2f | Model surrogate: %.5f   "
            % (i_episode, self.num_episodes, self.scores[-1],
               self.surrogates[-1]),
            end="")
        sys.stdout.flush()