# Shared imports for the agent implementations below. The Actor / Critic models, the OUNoise
# process, the replay buffer and the hyper-parameter constants (LR_ACTOR, LR_CRITIC,
# WEIGHT_DECAY, BATCH_SIZE, GAMMA, TAU, device, ...) are defined in the accompanying model
# and utility modules of each implementation and are not reproduced here.
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim


class Agent():
    """Interacts with and learns from the environment."""

    memory = None

    def __init__(self, num, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            num (int): index of this agent
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num = num
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        # The critic sees the concatenated observations and actions of both agents,
        # hence the state_size*2 + action_size*2 input dimension.
        self.critic_local = Critic(state_size*2 + action_size*2, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size*2 + action_size*2, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, scale=0.2)

    def act(self, state, add_noise=True, noise_amplitude=0.0):
        """Returns actions for the given state as per the current policy.

        noise_amplitude scales the exploration noise (0.0 disables it even if add_noise is True).
        """
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * noise_amplitude
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()
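# The OUNoise process used throughout this section is defined elsewhere in the repositories.
# Below is a minimal sketch of a compatible Ornstein-Uhlenbeck noise class; the mu/theta/sigma
# defaults and the optional `scale` factor are assumptions, not the authors' exact values.
import copy


class OUNoise:
    """Ornstein-Uhlenbeck exploration noise (sketch)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2, scale=1.0):
        # `size` may be an int (action_size) or a tuple such as (n_agents, action_size)
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.scale = scale
        np.random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a (scaled) noise sample."""
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.scale * self.state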
class MADDPG_Agent():

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        INPUT:
            state_size (int): dim of each state
            action_size (int): dim of each action
            seed (int): random seed
        """
        super(MADDPG_Agent, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        # replaced: self.seed = torch.manual_seed(seed)
        random.seed(seed)

        # initialise local network and target network for Actor
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.actor_optim = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # initialize local network and target network for Critic
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.critic_optim = optim.Adam(self.critic_local.parameters(), lr=lr_critic,
                                       weight_decay=weight_decay)

        # initialize the Ornstein-Uhlenbeck noise process (one row of noise per agent)
        self.noise = OUNoise((n_agents, action_size), seed)

        # initialize shared replay buffer
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed)

        # initialize time step to keep track of updates
        self.t_step = 0

    def hard_update(self, local_model, target_model):
        """Copy weights from the local (source) network to the target network (used at initialization)."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """Each agent adds its experience tuple to the shared replay buffer."""
        for i in range(n_agents):
            self.memory.add(states[i, :], actions[i, :], rewards[i], next_states[i, :], dones[i])

        self.t_step = (self.t_step + 1) % update_every
        if self.t_step == 0:
            # learn only once enough samples are available
            if len(self.memory) > batch_size:
                # update the networks update_freq times at each update step
                for i in range(update_freq):
                    experiences = self.memory.sample()
                    self.learn(experiences, gamma)
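# The shared ReplayBuffer constructed above is also defined outside this section. Below is a
# minimal sketch of a buffer exposing the interface used here (add / sample / len); the
# deque-plus-namedtuple layout and the tensor conversion in sample() are assumptions based on
# the constructor arguments, not the authors' exact implementation.
from collections import deque, namedtuple


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples (sketch)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and return them as torch tensors on `device`."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Return the current size of internal memory."""
        return len(self.memory)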
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, n_agents, buffer, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            n_agents (int): number of agents acting in the environment
            buffer (obj): shared replay buffer
            random_seed (int): random seed
        """
        # Set given state and action sizes
        self.state_size = state_size
        self.action_size = action_size
        random.seed(random_seed)

        # Actor network with local and target copies for soft updates
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic network with local and target copies for soft updates
        # (the centralized critic sees the observations and actions of all agents)
        self.critic_local = Critic(state_size, action_size, n_agents, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, n_agents, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process to boost exploration and hence learning of the network
        self.noise = OUNoise(action_size, random_seed)

        # Shared replay buffer
        self.memory = buffer

    def step(self):
        """Use a random sample from the shared buffer to learn."""
        if len(self.memory) > BATCH_SIZE:       # only once a full batch is available
            experiences = self.memory.sample()  # draw a sample
            self.learn(experiences, GAMMA)

    def act(self, state, add_noise=1.0):
        """Returns actions for the given state as per the current policy.

        add_noise scales the exploration noise (0.0 disables it).
        """
        state = torch.from_numpy(state).float().to(device)  # numpy state -> torch tensor
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()  # forward pass to get the action
        self.actor_local.train()
        action += self.noise.sample() * add_noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[List[torch.Tensor]]): per-agent lists of (s, a, r, s', done) tensors
            gamma (float): discount factor
        """
        l_states, l_actions, rewards, l_next_states, dones = experiences

        # Concatenate the per-agent observations and actions for the centralized critic
        t_states = torch.cat(l_states, dim=1).to(device)
        t_actions = torch.cat(l_actions, dim=1).to(device)
        t_next_states = torch.cat(l_next_states, dim=1).to(device)

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        t_next_actions = torch.cat([self.actor_target(states) for states in l_states], dim=1).to(device)
        Q_targets_next = self.critic_target(t_next_states, t_next_actions)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(t_states, t_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss: take the current states and predict actions
        t_actions_pred = torch.cat([self.actor_local(states) for states in l_states], dim=1).to(device)
        actor_loss = -self.critic_local(t_states, t_actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
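# The learn() method above expects the shared buffer to return per-agent *lists* of tensors
# (l_states, l_actions, l_next_states) for the centralized critic, plus this agent's own
# rewards and dones. A hypothetical helper with that output layout is sketched below; the
# buffer internals and the function name are assumptions, only the returned shapes are taken
# from the code above.
def sample_joint(experiences_batch, agent_idx, n_agents, device):
    """Turn a batch of joint experience tuples into the layout expected by learn().

    experiences_batch: list of (states, actions, rewards, next_states, dones) tuples, where
    each element holds the entries of *all* agents (arrays of shape [n_agents, ...]).
    """
    l_states = [torch.from_numpy(np.vstack([e[0][i] for e in experiences_batch])).float().to(device)
                for i in range(n_agents)]
    l_actions = [torch.from_numpy(np.vstack([e[1][i] for e in experiences_batch])).float().to(device)
                 for i in range(n_agents)]
    l_next_states = [torch.from_numpy(np.vstack([e[3][i] for e in experiences_batch])).float().to(device)
                     for i in range(n_agents)]
    # rewards and dones are kept only for the agent that is learning
    rewards = torch.from_numpy(np.vstack([e[2][agent_idx] for e in experiences_batch])).float().to(device)
    dones = torch.from_numpy(
        np.vstack([e[4][agent_idx] for e in experiences_batch]).astype(np.uint8)).float().to(device)
    return l_states, l_actions, rewards, l_next_states, dones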
class MADDPGAgent():
    """Interacts with and learns from the environment."""

    # Critic networks and optimizer are shared by all agents via class attributes
    critic_local = None
    critic_target = None
    critic_optimizer = None

    def __init__(self, state_size, action_size, memory, device='cpu', params=None):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            memory (obj): memory buffer to sample from
            device (str): device string, either 'cuda:0' or 'cpu'
            params (dict): hyper-parameters
        """
        self.state_size = state_size
        self.action_size = action_size
        self.device = device
        self.step_t = 0
        self.update_every = params['update_every']

        # Set parameters
        self.gamma = params['gamma']
        self.tau = params['tau']
        random.seed(params['seed'])

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, params['seed'],
                                 params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_target = Actor(state_size, action_size, params['seed'],
                                  params['actor_units'][0], params['actor_units'][1]).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=params['lr_actor'])

        # Critic Network (w/ Target Network), created once and shared between agents
        if MADDPGAgent.critic_local is None:
            MADDPGAgent.critic_local = Critic(state_size, action_size, params['seed'],
                                              params['critic_units'][0], params['critic_units'][1]).to(device)
        if MADDPGAgent.critic_target is None:
            MADDPGAgent.critic_target = Critic(state_size, action_size, params['seed'],
                                               params['critic_units'][0], params['critic_units'][1]).to(device)
        if MADDPGAgent.critic_optimizer is None:
            MADDPGAgent.critic_optimizer = optim.Adam(MADDPGAgent.critic_local.parameters(),
                                                      lr=params['lr_critic'],
                                                      weight_decay=params['weight_decay'])
        self.critic_local = MADDPGAgent.critic_local
        self.critic_target = MADDPGAgent.critic_target
        self.critic_optimizer = MADDPGAgent.critic_optimizer

        # Noise process
        self.noise = OUNoise(action_size, params['seed'],
                             theta=params['noise_theta'], sigma=params['noise_sigma'])

        # Replay memory
        self.memory = memory

    def store_actor_weights(self, filename):
        """Store weights of the Actor.

        Params
        ======
            filename (str): filename to store the actor weights
        """
        torch.save(self.actor_local.state_dict(), filename)

    def store_critic_weights(self, filename):
        """Store weights of the Critic.

        Params
        ======
            filename (str): filename to store the critic weights
        """
        torch.save(self.critic_local.state_dict(), filename)

    def load_actor_weights(self, filename):
        """Load weights of the Actor.

        Params
        ======
            filename (str): filename to load the actor weights from
        """
        self.actor_local.load_state_dict(torch.load(filename))

    def load_critic_weights(self, filename):
        """Load weights of the Critic.

        Params
        ======
            filename (str): filename to load the critic weights from
        """
        self.critic_local.load_state_dict(torch.load(filename))

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        self.step_t = (self.step_t + 1) % self.update_every

        # Learn, if enough samples are available in memory
        if self.step_t == 0 and len(self.memory) > self.memory.get_batch_size():
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for the given state as per the current policy."""
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using a given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
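# The MADDPGAgent constructor above reads its hyper-parameters from a plain dict. The keys
# below are the ones actually accessed in __init__; the example values are placeholders, not
# the authors' tuned settings.
params = {
    'seed': 0,
    'update_every': 1,           # learn after every step once enough samples are stored
    'gamma': 0.99,               # discount factor
    'tau': 1e-3,                 # soft-update interpolation factor
    'lr_actor': 1e-4,
    'lr_critic': 1e-3,
    'weight_decay': 0.0,         # L2 weight decay for the shared critic optimizer
    'actor_units': (256, 128),   # hidden layer sizes of the actor
    'critic_units': (256, 128),  # hidden layer sizes of the shared critic
    'noise_theta': 0.15,         # OU noise parameters
    'noise_sigma': 0.2,
}
# agent = MADDPGAgent(state_size, action_size, memory, device='cpu', params=params)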
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, config):
        """Initialize an Agent object.

        Params (attributes of config)
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
            buffer_size (int): replay buffer size
            batch_size (int): minibatch size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay
            ou_mu (float): OUNoise mu
            ou_theta (float): OUNoise theta
            ou_sigma (float): OUNoise sigma
            update_every_t_steps (int): timesteps between updates
            num_of_updates (int): number of update passes at each update step
        """
        self.config = config

        print("[AGENT INFO] DDPG constructor initialized parameters:\n"
              " num_agents={}\n state_size={}\n action_size={}\n random_seed={}\n"
              " actor_fc1_units={}\n actor_fc2_units={}\n critic_fcs1_units={}\n critic_fc2_units={}\n"
              " buffer_size={}\n batch_size={}\n gamma={}\n tau={}\n lr_actor={}\n lr_critic={}\n"
              " weight_decay={}\n ou_mu={}\n ou_theta={}\n ou_sigma={}\n"
              " update_every_t_steps={}\n num_of_updates={}\n".format(
                  self.config.num_agents, self.config.state_size, self.config.action_size,
                  self.config.random_seed, self.config.actor_fc1_units, self.config.actor_fc2_units,
                  self.config.critic_fcs1_units, self.config.critic_fc2_units, self.config.buffer_size,
                  self.config.batch_size, self.config.gamma, self.config.tau, self.config.lr_actor,
                  self.config.lr_critic, self.config.weight_decay, self.config.ou_mu,
                  self.config.ou_theta, self.config.ou_sigma, self.config.update_every_t_steps,
                  self.config.num_of_updates))

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(self.config.state_size, self.config.action_size, self.config.random_seed,
                                 self.config.actor_fc1_units, self.config.actor_fc2_units).to(device)
        self.actor_target = Actor(self.config.state_size, self.config.action_size, self.config.random_seed,
                                  self.config.actor_fc1_units, self.config.actor_fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.config.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.config.state_size, self.config.action_size, self.config.random_seed,
                                   self.config.critic_fcs1_units, self.config.critic_fc2_units).to(device)
        self.critic_target = Critic(self.config.state_size, self.config.action_size, self.config.random_seed,
                                    self.config.critic_fcs1_units, self.config.critic_fc2_units).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.config.lr_critic,
                                           weight_decay=self.config.weight_decay)

        # Noise process
        self.noise = OUNoise(self.config.action_size, self.config.random_seed,
                             mu=self.config.ou_mu, theta=self.config.ou_theta, sigma=self.config.ou_sigma)

        # Replay memory
        self.memory = ReplayBuffer(self.config.action_size, self.config.buffer_size,
                                   self.config.batch_size, self.config.random_seed)

    def reset(self):
        self.noise.reset()

    def step(self, states, actions, rewards, next_states, dones, agent_number, timestep):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        # Save experience / reward
        self.memory.add(states, actions, rewards, next_states, dones)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.config.batch_size and timestep % self.config.update_every_t_steps == 0:
            for _ in range(self.config.num_of_updates):
                experiences = self.memory.sample()
                self.learn(experiences, self.config.gamma, agent_number)

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.config.num_agents, self.config.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            # get the action for each agent and concatenate them
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        # add noise to actions
        if add_noise:
            actions += self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using a given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
            agent_number (int): index of the agent whose action slice is replaced
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models.
        # The column slicing assumes two agents with action_size == 2: columns 0:2 belong
        # to agent 0 and columns 2:4 to agent 1.
        actions_next = self.actor_target(next_states)
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss, again replacing only this agent's slice of the joint action
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.tau)
        self.soft_update(self.actor_local, self.actor_target, self.config.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
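# The config object consumed by the Agent above only needs attribute access. A minimal sketch
# using types.SimpleNamespace follows; the field names are taken from the constructor, while
# the values (Tennis-style sizes, typical DDPG settings) are assumptions, not the authors'
# tuned hyper-parameters.
from types import SimpleNamespace

config = SimpleNamespace(
    num_agents=2, state_size=24, action_size=2, random_seed=0,
    actor_fc1_units=256, actor_fc2_units=128,
    critic_fcs1_units=256, critic_fc2_units=128,
    buffer_size=int(1e5), batch_size=128,
    gamma=0.99, tau=1e-3,
    lr_actor=1e-4, lr_critic=1e-3, weight_decay=0.0,
    ou_mu=0.0, ou_theta=0.15, ou_sigma=0.2,
    update_every_t_steps=1, num_of_updates=1,
)
# agent_0 = Agent(config)
# agent_1 = Agent(config)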