import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from typing import Mapping

# Actor, Critic, Ornstein (Ornstein-Uhlenbeck noise process) and ReplayBuffer
# are assumed to be importable from elsewhere in the project.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DDPGAgent(object):
    """Interacts with and learns from the environment."""

    def __init__(self, id, state_size, action_size, seed, memory, num_agents,
                 hyperparameters: Mapping[str, float]):
        """Initialize a DDPG agent object.

        Params
        ======
            id (int): agent's id
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            memory (ReplayBuffer): replay buffer to store the experience of this agent
            num_agents (int): number of agents in the environment
            hyperparameters (Mapping[str, float]): hyperparameter values of the model.
                The expected keys are:
                - batch_size (int): minibatch size
                - lr_actor (float): learning rate of the actor
                - lr_critic (float): learning rate of the critic
                - gamma (float): discount factor
                - weight_decay (float): critic L2 weight decay
                - tau (float): value for soft update of target parameters
                - update_frequency (int): how many steps must be executed before learning starts
                - n_learns (int): how many learning passes per update
        """
        self.id = id
        self.__name__ = 'DDPG'
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = hyperparameters['gamma']
        self.batch_size = int(hyperparameters['batch_size'])
        self.tau = hyperparameters['tau']
        self.update_frequency = int(hyperparameters['update_frequency'])
        self.n_learns = int(hyperparameters['n_learns'])

        # Actor network (with target network)
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=hyperparameters['lr_actor'])

        # Critic network (with target network)
        self.critic_local = Critic(state_size, action_size, num_agents, seed).to(device)
        self.critic_target = Critic(state_size, action_size, num_agents, seed).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=hyperparameters['lr_critic'],
            weight_decay=hyperparameters['weight_decay'])

        # Noise process
        self.noise = Ornstein(action_size)

        # Replay memory
        self.memory = memory

        # Initialize the time step (for every update_frequency steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done,
             other_states, other_actions, other_next_states):
        """Save experience in replay memory, and use random samples from the buffer to learn."""
        self.memory.add(state, action, reward, next_state, done,
                        other_states, other_actions, other_next_states)

        self.t_step = (self.t_step + 1) % self.update_frequency
        if self.t_step == 0:
            # Learn, if enough samples are available in memory
            for _ in range(self.n_learns):
                if len(self.memory) > self.batch_size:
                    experiences = self.memory.sample(self.batch_size)
                    self.learn(experiences, self.gamma)

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using the given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, _, _, _, _, other_states, _, _ = experiences
        self.update_critic(experiences, gamma)
        self.update_actor(states, other_states)
        self.update_target_networks()

    def update_critic(self, experiences, gamma):
        """Update the critic network given the experiences."""
        (states, actions, rewards, next_states, dones,
         other_states, other_actions, other_next_states) = experiences

        all_states = torch.cat((states, other_states), dim=1).to(device)
        all_actions = torch.cat((actions, other_actions), dim=1).to(device)
        all_next_states = torch.cat((next_states, other_next_states), dim=1).to(device)

        # Next actions must come from the target actor evaluated at the *next* states,
        # matching the Q_targets formula in learn()
        local_all_next_actions = []
        local_all_next_actions.append(self.actor_target(next_states))
        local_all_next_actions.append(self.actor_target(other_next_states))
        all_next_actions = torch.cat(local_all_next_actions, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, states, other_states):
        """Update the actor network using the sampled policy gradient."""
        all_states = torch.cat((states, other_states), dim=1).to(device)

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss; the other agent's predicted actions are detached so
        # only this agent's policy receives gradients
        actions_pred = self.actor_local(states)
        other_actions_pred = self.actor_local(other_states).detach()
        actions_pred = torch.cat((actions_pred, other_actions_pred), dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

    def update_target_networks(self):
        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
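# --------------------------------------------------------------------------- #
# Usage sketch for the multi-agent DDPGAgent above (illustration only).
# The ReplayBuffer instance passed in, the per-agent environment API
# (env.reset() / env.step() returning per-agent arrays) and the hyperparameter
# values are assumptions, not part of the original code.
# --------------------------------------------------------------------------- #
def _maddpg_usage_sketch(env, replay_buffer, num_agents=2, state_size=24, action_size=2):
    hyperparameters = {
        'batch_size': 128, 'lr_actor': 1e-4, 'lr_critic': 1e-3, 'gamma': 0.99,
        'weight_decay': 0.0, 'tau': 1e-3, 'update_frequency': 2, 'n_learns': 3,
    }
    agents = [DDPGAgent(i, state_size, action_size, seed=0, memory=replay_buffer,
                        num_agents=num_agents, hyperparameters=hyperparameters)
              for i in range(num_agents)]

    states = env.reset()                                    # assumed shape: (num_agents, state_size)
    for _ in range(1000):
        actions = np.vstack([agent.act(states[i]) for i, agent in enumerate(agents)])
        next_states, rewards, dones = env.step(actions)     # assumed env API
        for i, agent in enumerate(agents):
            j = 1 - i                                       # index of the other agent (2-agent case)
            agent.step(states[i], actions[i], rewards[i], next_states[i], dones[i],
                       states[j], actions[j], next_states[j])
        states = next_states
        if np.any(dones):
            states = env.reset()
            for agent in agents:
                agent.reset()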
# --------------------------------------------------------------------------- #
# A second, single-agent DDPG implementation for a Gym-style environment.
# It relies on its own Actor, Critic, OUNoise and ExperienceReplayLog classes,
# whose constructor signatures differ from those used above.
# --------------------------------------------------------------------------- #
class DDPGAgent:
    def __init__(self, env, gamma, tau, buffer_maxlen,
                 critic_learning_rate, actor_learning_rate, max_action=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.noise = OUNoise(env.action_space)
        self.iter = 0.0
        self.noisy = False
        self.max_action = max_action
        print(self.action_dim)
        print(self.obs_dim)

        # RL hyperparameters
        self.gamma = gamma
        self.tau = tau

        # Initialize critic and actor networks (and their target copies)
        self.critic = Critic(self.obs_dim, self.action_dim).to(self.device)
        self.critic_target = Critic(self.obs_dim, self.action_dim).to(self.device)

        self.actor = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)
        self.actor_target = Actor(self.obs_dim, self.action_dim, self.max_action).to(self.device)

        # Copy local network parameters into the target networks
        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        # Set optimization algorithms
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)

        self.replay_buffer = ExperienceReplayLog(buffer_maxlen)

    def get_action(self, obs):
        state = torch.FloatTensor(obs).unsqueeze(0).to(self.device)
        action = self.actor.forward(state)
        action = action.squeeze(0).cpu().detach().numpy()
        if self.noisy:
            # Add exploration noise and advance the noise schedule
            action = self.noise.get_action(action, self.iter)
            self.iter = self.iter + 1
        return action

    def update(self, batch_size):
        # Sample a batch of transitions from the replay buffer
        state_batch, action_batch, reward_batch, next_state_batch, masks = \
            self.replay_buffer.sample(batch_size)

        state_batch = torch.FloatTensor(state_batch).to(self.device)
        action_batch = torch.FloatTensor(action_batch).to(self.device)
        reward_batch = torch.FloatTensor(reward_batch).to(self.device)
        next_state_batch = torch.FloatTensor(next_state_batch).to(self.device)
        masks = torch.FloatTensor(masks).to(self.device)

        # Q value updates
        curr_Q = self.critic.forward(state_batch, action_batch)
        next_actions = self.actor_target.forward(next_state_batch)
        next_Q = self.critic_target.forward(next_state_batch, next_actions.detach())
        # Mask out the bootstrap term on terminal transitions
        # (masks is assumed to be 1 - done, as the name suggests)
        expected_Q = reward_batch + self.gamma * next_Q * masks

        # Update critic network
        q_loss = F.mse_loss(curr_Q, expected_Q.detach())
        self.critic_optimizer.zero_grad()
        q_loss.backward()
        self.critic_optimizer.step()

        # Update actor network
        policy_loss = -self.critic.forward(state_batch, self.actor.forward(state_batch)).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Soft-update actor and critic target networks
        for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))

        for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data * self.tau + target_param.data * (1.0 - self.tau))
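# --------------------------------------------------------------------------- #
# Training loop sketch for the single-agent DDPGAgent above (illustration
# only). It assumes a Gym-style environment and that ExperienceReplayLog
# exposes push(state, action, reward, next_state, done) and __len__; those
# names are assumptions, not confirmed by the original code.
# --------------------------------------------------------------------------- #
def _ddpg_training_sketch(env, max_episodes=100, max_steps=500, batch_size=64):
    agent = DDPGAgent(env, gamma=0.99, tau=1e-2, buffer_maxlen=100_000,
                      critic_learning_rate=1e-3, actor_learning_rate=1e-4)
    agent.noisy = True                                      # enable exploration noise

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0.0
        for _ in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
            episode_reward += reward

            if len(agent.replay_buffer) > batch_size:       # assumed __len__ support
                agent.update(batch_size)

            state = next_state
            if done:
                break
        print(f"episode {episode}: reward {episode_reward:.2f}")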