class DDPG:
    def __init__(self, state_size, action_size, random_seed, hyperparams):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.hyperparams = hyperparams

        self.actor = Actor(state_size, action_size, random_seed).to(device)
        self.actor_noise = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optim = optim.Adam(self.actor.parameters(), lr=hyperparams.alpha_actor)

        self.critic = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optim = optim.Adam(
            self.critic.parameters(),
            lr=hyperparams.alpha_critic,
            weight_decay=hyperparams.weight_decay,
        )

        self.replay_buffer = ReplayBuffer(hyperparams.buffer_size, hyperparams.batch_size, random_seed)
        self.noise = OUNoise(
            action_size,
            random_seed,
            self.hyperparams.mu,
            self.hyperparams.theta,
            self.hyperparams.sigma,
        )

    def step(self, state, action, reward, next_state, done):
        self.replay_buffer.add(state, action, reward, next_state, done)
        if len(self.replay_buffer) > self.hyperparams.batch_size:
            observations = self.replay_buffer.sample()
            self.update_params(observations)

    def select_action(self, state, train=True, nn_noise=False):
        state = torch.from_numpy(state).to(dtype=torch.float32, device=device)
        self.actor.eval()
        if nn_noise:
            action = self.actor_noise(state).cpu().data.numpy()
        else:
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        if train:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset_state()

    def update_params(self, observations):
        states, actions, rewards, next_states, dones = observations

        next_actions = self.actor_target(next_states)
        next_Q_values = self.critic_target(next_states, next_actions)
        Q_values = rewards + (self.hyperparams.gamma * next_Q_values * (1 - dones))
        expected_Q = self.critic(states, actions)
        Q_values_loss = F.l1_loss(expected_Q, Q_values)
        self.critic_optim.zero_grad()
        Q_values_loss.backward()
        self.critic_optim.step()

        policy_loss = -self.critic(states, self.actor(states))
        policy_loss = policy_loss.mean()
        self.actor_optim.zero_grad()
        policy_loss.backward()
        self.actor_optim.step()

        for qtarget_param, qlocal_param in zip(self.critic_target.parameters(), self.critic.parameters()):
            qtarget_param.data.copy_(self.hyperparams.tau * qlocal_param.data
                                     + (1.0 - self.hyperparams.tau) * qtarget_param.data)
        for target_param, local_param in zip(self.actor_target.parameters(), self.actor.parameters()):
            target_param.data.copy_(self.hyperparams.tau * local_param.data
                                    + (1.0 - self.hyperparams.tau) * target_param.data)
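# A minimal usage sketch for the DDPG class above, assuming a Gym-style `env`
# and a simple hyperparameter namespace. The field names, sizes, and the `env`
# loop are illustrative assumptions, not part of the original project.
from types import SimpleNamespace

hyperparams = SimpleNamespace(
    alpha_actor=1e-4, alpha_critic=1e-3, weight_decay=0.0,
    buffer_size=int(1e6), batch_size=128,
    gamma=0.99, tau=1e-3, mu=0.0, theta=0.15, sigma=0.2,
)
# agent = DDPG(state_size=33, action_size=4, random_seed=0, hyperparams=hyperparams)
# state = env.reset()
# for t in range(max_steps):
#     action = agent.select_action(state, train=True)
#     next_state, reward, done, _ = env.step(action)
#     agent.step(state, action, reward, next_state, done)
#     state = next_state
#     if done:
#         agent.reset()
#         break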
class Agent:
    def __init__(self,
                 device,
                 state_size,
                 action_size,
                 buffer_size=10,
                 batch_size=10,
                 actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3,
                 discount_rate=0.99,
                 tau=0.1,
                 steps_per_update=4,
                 dropout_p=0.0,
                 weight_decay=0.0001,
                 noise_max=0.2,
                 noise_decay=1.0,
                 n_agents=1):
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        def make_critic():
            critic = Critic(state_size * n_agents, action_size * n_agents)
            critic = critic.to(device)
            return critic

        self.critic_control = make_critic()
        self.critic_control.dropout.p = dropout_p
        self.critic_target = make_critic()
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size).to(device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size, buffer_size, n_agents)

        self.discount_rate = discount_rate
        self.tau = tau
        self.step_count = 0
        self.steps_per_update = steps_per_update

        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988, sigma=self.noise_max)
        self.noise_decay = noise_decay

    def policy(self, state, training=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if training:
            noise = self.noise.sample()
            action += noise
        return np.clip(action, -1, 1)

    def step(self, state, action, reward, next_state, done):
        self.actor_control.noise(self.noise.sigma)
        p = self.calculate_p(state, action, reward, next_state, done)
        self.replay_buffer.add(state, action, reward, next_state, done, p)
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.min_buffer_size:
            return
        indices, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        self.actor_control.eval()
        error = self.bellman_eqn_error(states, actions, rewards, next_states, dones)
        self.actor_control.train()

        importance_scaling = (self.replay_buffer.buffer_size * p.unsqueeze(1).repeat(1, 2, 1)) ** -1
        importance_scaling /= importance_scaling.max()

        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(unpack_agents(states))
        expected_actions = pack_agents(self.n_agents, expected_actions)
        critic_score = self.critic_control(agents_to_global(states), agents_to_global(expected_actions))
        critic_score = global_to_agents(critic_score)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        self.replay_buffer.update(indices, (error.detach().abs().cpu() + 1e-3).mean(dim=1))

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double-DQN-style error: the control actor picks the next action, the
        target critic values it, and the result feeds the Bellman equation error."""
        next_actions = self.actor_control(unpack_agents(next_states))
        next_actions = pack_agents(self.n_agents, next_actions)
        next_states_global = agents_to_global(next_states)
        next_actions_global = agents_to_global(next_actions)
        target_action_values = self.critic_target(next_states_global, next_actions_global)
        target_action_values = global_to_agents(target_action_values)
        target_rewards = (rewards + self.discount_rate * (1 - dones) * target_action_values)

        states = agents_to_global(states)
        actions = agents_to_global(actions)
        current_rewards = self.critic_control(states, actions)
        current_rewards = global_to_agents(current_rewards)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        next_state = torch.from_numpy(next_state).float().to(self.device).unsqueeze(0)
        state = torch.from_numpy(state).float().to(self.device).unsqueeze(0)
        action = torch.from_numpy(action).float().to(self.device).unsqueeze(0)
        reward = torch.from_numpy(reward).float().to(self.device).unsqueeze(0)
        done = torch.from_numpy(done).float().to(self.device).unsqueeze(0)
        done = done.unsqueeze(2)
        reward = reward.unsqueeze(2)

        self.actor_control.eval()
        self.critic_control.eval()
        with torch.no_grad():
            error = abs(self.bellman_eqn_error(state, action, reward, next_state, done)) + 1e-3
        self.critic_control.train()
        self.actor_control.train()
        return error.mean(dim=1)

    def update_target(self, control, target):
        for target_param, control_param in zip(target.parameters(), control.parameters()):
            target_param.data.copy_(self.tau * control_param.data
                                    + (1.0 - self.tau) * target_param.data)

    def end_of_episode(self, final_score):
        self.step_count = 0
        self.noise.sigma *= self.noise_decay
        self.last_score = final_score
        self.noise.reset()

    def save(self, path):
        torch.save(self.critic_control.state_dict(), path + '-critic.p')
        torch.save(self.actor_control.state_dict(), path + '-actor.p')

    def restore(self, path):
        self.critic_control.load_state_dict(
            torch.load(path + '-critic.p', map_location='cpu'))
        self.actor_control.load_state_dict(
            torch.load(path + '-actor.p', map_location='cpu'))
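# The Agent above relies on reshaping helpers (unpack_agents, pack_agents,
# agents_to_global, global_to_agents) defined elsewhere in its project. A minimal
# sketch of plausible implementations, assuming batches shaped
# [batch, n_agents, features]; the exact shapes and broadcast behaviour are
# assumptions, not the original code.
def unpack_agents(x):
    # [batch, n_agents, features] -> [batch * n_agents, features]
    return x.reshape(-1, x.shape[-1])

def pack_agents(n_agents, x):
    # [batch * n_agents, features] -> [batch, n_agents, features]
    return x.reshape(-1, n_agents, x.shape[-1])

def agents_to_global(x):
    # concatenate every agent's features into one global vector per sample
    return x.reshape(x.shape[0], -1)

def global_to_agents(x):
    # add a per-agent axis back so a [batch, 1] critic output broadcasts
    # against per-agent tensors of shape [batch, n_agents, 1]
    return x.unsqueeze(1)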
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0  # running reward, used by get_episode_score
        self.count = 0           # number of rewards received this episode
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001

        # Instances of the policy function (actor) and the value function (critic)
        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
        # Actor optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate_actor)
        # Critic optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.learning_rate_critic)
        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Parameters for the algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001   # Soft-update factor for the target networks

    # The actor decides what to do based on the policy
    def act_local(self, state):
        # Given a state, return the action recommended by actor_local.
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Switch actor_local to evaluation mode and disable gradient tracking,
        # since this forward pass is prediction only, not training
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        # Set the model back to training mode
        self.actor_local.train()
        return actions.detach()

    def act_target(self, states):
        # Pass the states to the actor target model to get the actions
        # recommended by the target policy
        self.actor_target.eval()
        with torch.no_grad():
            actions = self.actor_target(states)
        self.actor_target.train()
        return actions.detach()

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
class DDPG(): """DDPG agent""" def __init__(self, state_size, action_size, params, seed): """Initialize a DDPG agent Params ====== state_size (int): dimension of each state action_size (int): dimension of each action params (Params): hyperparameters seed (int): random seed """ self.gamma = params.gamma self.tau = params.tau self.seed = np.random.seed(seed) # actor networks self.actor_local = Actor(state_size, action_size, params.units_actor, seed).to(device) self.actor_target = Actor(state_size, action_size, params.units_actor, seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), params.lr_actor) # critic newtworks self.critic_local = Critic(state_size, action_size, params.units_critic, seed).to(device) self.critic_target = Critic(state_size, action_size, params.units_critic, seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), params.lr_critic) # Noise process self.noise = OUNoise(action_size, seed, params.mu, params.theta, params.sigma) def noise_reset(self): self.noise.reset() def act(self, state): """Returns actions for given state(s) as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).data.cpu().numpy() self.actor_local.train() action += self.noise.sample() return np.clip(action, -1, 1) def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s') tuples """ states, actions, rewards, next_states, dones = experiences #### Update critic # Get predicted next-state actions from actor_target model next_actions = self.actor_target(next_states) # Get predicted next-state Q-Values from critic_target model next_q_targets = self.critic_target(next_states, next_actions) # Compute Q targets for current states Q_targets = rewards + self.gamma * next_q_targets * (1.0 - dones) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize critic loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() ### Update actor # Compute actor loss predicted_actions = self.actor_local(states) actor_loss = -self.critic_local(states, predicted_actions).mean() # Minimize actor loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() ### Update target networks self.soft_update(self.critic_local, self.critic_target) self.soft_update(self.actor_local, self.actor_target) def soft_update(self, local_model, target_model): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Instances of the policy function (actor) and the value function (critic)
        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)
        # Critic local and target
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)
        # Actor optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.learning_rate_actor)
        # Critic optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.learning_rate_critic)
        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the replay memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Parameters for the algorithm
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001   # Soft-update factor for the target networks

    # The agent interacts with the environment through step()
    def step(self, state, action, reward, next_state, done):
        # Add the reward of this time step to the running total
        self.total_reward += reward
        # Count the number of rewards received in the episode
        self.count += 1
        # Store the experience tuple in the replay buffer
        self.memory.add(state, action, reward, next_state, done)
        # Learn every update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check whether enough samples are available to produce a batch
            # and learn from it
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Train the networks using the experiences
                self.learn(experiences)
        # Roll over last state action (not needed)
        # self.last_state = next_state

    # The actor decides what to do based on the policy
    def act(self, state):
        # Given a state, return the action recommended by the policy.
        # Reshape the state to fit the torch tensor input
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Pass the state to the actor_local model to get an action; switch to
        # evaluation mode and disable gradient tracking, since this forward pass
        # is prediction only
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        # Set the model back to training mode
        self.actor_local.train()
        # Because we are exploring, add some noise to the action vector
        return list(actions.detach().numpy().reshape(4, ) + self.noise.sample())

    # The learning logic, called when the agent takes a step to learn
    def learn(self, experiences):
        """
        Learning means updating the network parameters using the batch of
        experiences. The networks learn from stored experiences, not from
        direct interaction with the environment.
        """
        # Reshape the experience tuples into separate arrays of states, actions,
        # rewards, next_states, and dones, converting each member into a column vector
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Convert states, actions, and next_states to torch tensors;
        # rewards and dones do not need to be tensors
        states = torch.from_numpy(states).float().unsqueeze(0).to(device)
        actions = torch.from_numpy(actions).float().unsqueeze(0).to(device)
        next_states = torch.from_numpy(next_states).float().unsqueeze(0).to(device)

        # First pass the batch of next states to the target actor so it tells us
        # which actions to execute. The target network is used for prediction only:
        # its weights are changed by a soft update, not by an optimizer.
        self.actor_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states).detach()
        self.actor_target.train()

        # The target critic evaluates the actions chosen by the target actor in the
        # next state and produces Q(s', a') for those (next_state, action) pairs.
        # These tuples come from the replay buffer, not from interacting with the
        # environment. The next-state Q-values feed the Bellman equation to compute
        # the current-state targets.
        self.critic_target.eval()
        with torch.no_grad():
            q_targets_next_state_action_values = self.critic_target(
                next_states, next_state_actions).detach()
        self.critic_target.train()

        # With the next-state Q-values Q(s', a') for the sampled batch, compute the
        # current-state targets using the one-step TD update. Terminal states
        # contribute a target of zero beyond the reward. These targets are used to
        # train critic_local in a supervised-learning fashion.
        q_targets = torch.from_numpy(
            rewards + self.gamma * q_targets_next_state_action_values.numpy() * (1 - dones)).float()

        # --- Optimize the local critic model --- #
        # Supervised training of critic_local: pass the batch of states and actions
        # to get the expected Q-value of each (state, action) pair
        q_expected = self.critic_local(states, actions)
        # Clear the gradient buffers in preparation
        self.critic_optimizer.zero_grad()
        # Loss for critic_local: difference between q_expected and q_targets
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        critic_loss.backward(retain_graph=True)
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        # Optimize critic_local using the optimizer defined in __init__
        self.critic_optimizer.step()

        # --- Optimize the local actor model --- #
        # Get the actor's actions for the sampled states
        actor_actions = self.actor_local(states)
        # Use as the loss the negative sum of the Q-values produced by the freshly
        # optimized critic_local for the actions chosen by actor_local
        loss_actor = -1 * torch.sum(self.critic_local.forward(states, actor_actions))
        # Set the model gradients to zero in preparation
        self.actor_optimizer.zero_grad()
        # Backpropagate
        loss_actor.backward()
        # Optimize actor_local using the optimizer defined in __init__
        self.actor_optimizer.step()

        # Soft-update the target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
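# ReplayBuffer is another dependency defined elsewhere. A minimal sketch matching
# the (buffer_size, batch_size) constructor and the add/sample/__len__ interface
# used by AgentDDPG above; the Experience field names are an assumption based on
# how learn() unpacks the sampled batch.
import random
from collections import deque, namedtuple

Experience = namedtuple("Experience", ["state", "action", "reward", "next_state", "done"])

class ReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)  # oldest experiences fall off the end
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        """Store a single experience tuple."""
        self.memory.append(Experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a uniformly random batch of experiences."""
        return random.sample(self.memory, k=self.batch_size)

    def __len__(self):
        return len(self.memory)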
class DDPGAgent():
    def __init__(self,
                 seed,
                 n_state,
                 n_action,
                 batch_size=64,
                 buffer=1e5,
                 gamma=0.99,
                 lr_actor=1e-4,
                 lr_critic=1e-3,
                 weight_decay=0,
                 tau=1e-3):
        self.batch_size = batch_size
        # init actor
        self.local_actor = Actor(n_state, n_action, seed).to(device)
        self.target_actor = Actor(n_state, n_action, seed).to(device)
        self.optim_actor = torch.optim.Adam(self.local_actor.parameters(), lr=lr_actor)
        # init critic
        self.local_critic = Critic(n_state, n_action, seed).to(device)
        self.target_critic = Critic(n_state, n_action, seed).to(device)
        self.optim_critic = torch.optim.Adam(self.local_critic.parameters(),
                                             lr=lr_critic, weight_decay=weight_decay)
        # init memory
        self.memory = memory(int(buffer), device, seed)
        self.tau = tau
        self.gamma = gamma
        self.noise = noise(n_action, seed=seed)

    def step(self, state, action, reward, next_state, done):
        event = Event(state, action, reward, next_state, done)
        self.memory.add(event)
        self.learn()

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)
        self.local_actor.eval()
        with torch.no_grad():
            action = self.local_actor(state).cpu().data.numpy()
        self.local_actor.train()
        action += self.noise.make()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self):
        """Update both actor and critic networks"""
        event_batch = self.memory.sample(self.batch_size)
        if event_batch is None:
            return
        event_batch = self.memory.deserialize(event_batch)
        self.update_critic(event_batch)
        self.update_actor(event_batch)
        self.update_target(self.local_actor, self.target_actor)
        self.update_target(self.local_critic, self.target_critic)

    def update_critic(self, batch):
        ## TD step
        # t
        expected_Q = self.local_critic(batch.states, batch.actions)
        # t+1
        actions_pred = self.target_actor(batch.states_next)
        target_Q_next = self.target_critic(batch.states_next, actions_pred)
        # only learning from positives? negatives are good source of learning too
        target_Q = batch.rewards + (self.gamma * target_Q_next * (1 - batch.dones))
        loss = nn.functional.mse_loss(expected_Q, target_Q)
        self.optim_critic.zero_grad()
        loss.backward()
        self.optim_critic.step()

    def update_actor(self, batch):
        actions_predicted = self.local_actor(batch.states)  # fixthis
        loss = -self.local_critic(batch.states, actions_predicted).mean()  # rms
        self.optim_actor.zero_grad()
        loss.backward()
        self.optim_actor.step()

    def update_target(self, local, target):
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(self.tau * local_param.data + (1.0 - self.tau) * target_param.data)
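# The Event container and the memory/noise factories used by DDPGAgent are not
# shown in this file. A plausible minimal definition of Event, assuming it simply
# mirrors a (state, action, reward, next_state, done) transition; the deserialized
# batch object with .states/.actions/.states_next fields is left to the project's
# own memory implementation.
from collections import namedtuple

Event = namedtuple("Event", ["state", "action", "reward", "next_state", "done"])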
class Agent(): """Main DDPG agent that extracts experiences and learns from them""" def __init__(self, state_size, action_size): """ Initializes Agent object. @Param: 1. state_size: dimension of each state. 2. action_size: number of actions. """ self.state_size = state_size self.action_size = action_size #Actor network self.actor_local = Actor(self.state_size, self.action_size).to(device) #local model self.actor_target = Actor(self.state_size, self.action_size).to(device) #target model, TD-target self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) #initialize optimizer using Adam as regularizer for Actor network. #Critic network self.critic_local = Critic(self.state_size, self.action_size).to(device) #local model self.critic_target = Critic(self.state_size, self.action_size).to(device) #target model, TD-target self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) #initialize optimizer using Adam as regularizer for Critic network. #Noise proccess self.noise = OUNoise(action_size) #define Ornstein-Uhlenbeck process #Replay memory self.memory = ReplayBuffer(self.action_size, BUFFER_SIZE, MINI_BATCH) #define experience replay buffer object def step(self, state, action, reward, next_state, done): """ Saves an experience in the replay memory to learn from using random sampling. @Param: 1. state: current state, S. 2. action: action taken based on current state. 3. reward: immediate reward from state, action. 4. next_state: next state, S', from action, a. 5. done: (bool) has the episode terminated? Exracted version for trajectory used in calculating the value for an action, a.""" self.memory.add(state, action, reward, next_state, done) #append to memory buffer #check if enough samples in buffer. if so, learn from experiences, otherwise, keep collecting samples. if(len(self.memory) > MINI_BATCH): experience = self.memory.sample() self.learn(experience) def reset(self): """Resets the noise process to mean""" self.noise.reset() def act(self, state, add_noise=True): """ Returns a deterministic action given current state. @Param: 1. state: current state, S. 2. add_noise: (bool) add bias to agent, default = True (training mode) """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) #typecast to torch.Tensor self.actor_local.eval() #set in evaluation mode with torch.no_grad(): #reset gradients action = self.actor_local(state).cpu().data.numpy() #deterministic action based on Actor's forward pass. self.actor_local.train() #set training mode #If training mode, i.e. add_noise = True, add noise to the model to learn a more accurate policy for current state. if(add_noise): action += self.noise.sample() return action def learn(self, experiences, gamma=GAMMA): """ Learn from a set of experiences picked up from a random sampling of even frequency (not prioritized) of experiences when buffer_size = MINI_BATCH. Updates policy and value parameters accordingly @Param: 1. experiences: (Tuple[torch.Tensor]) set of experiences, trajectory, tau. tuple of (s, a, r, s', done) 2. gamma: immediate reward hyper-parameter, 0.99 by default. 
""" #Source from: Udacity/DRL #Extrapolate experience into (state, action, reward, next_state, done) tuples states, actions, rewards, next_states, dones = experiences #Update Critic network actions_next = self.actor_target(next_states) # Get predicted next-state actions and Q values from target models Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # r + γ * Q-values(a,s) # Compute critic loss using MSE Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() #Update Actor Network # Compute actor loss actions_pred = self.actor_local(states) #gets mu(s) actor_loss = -self.critic_local(states, actions_pred).mean() #gets V(s,a) # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. Copies model τ every experience. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)
        self.target_actor = Actor(lr_actor, n_states, n_actions, actor_l1_size, actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions, critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation, dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # add noise to action - for exploration
        mu_prime = mu + torch.tensor(self.noise(), dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation, dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # not enough data in replay buffer
            return

        # select random events
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state, dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        target = []
        for j in range(self.batch_size):
            target.append(reward[j] + self.gamma * critic_value_[j] * done[j])
        target = torch.tensor(target).to(self.critic.device)
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                                      (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                                     (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic, fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, num_agents, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.eps = EPS_START self.eps_decay = 1 / (EPS_EP_END * LEARN_NUM ) # set decay rate based on epsilon end target self.timestep = 0 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise((num_agents, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number): """Save experience in replay memory, and use random sample from buffer to learn.""" self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory) > BATCH_SIZE and self.timestep % LEARN_EVERY == 0: for _ in range(LEARN_NUM): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): # get action for each agent and concatenate them for agent_num, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[agent_num, :] = action self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Construct next actions vector relative to the agent if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) else: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Construct action prediction vector relative to each agent if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) else: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, EPS_FINAL) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
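# A comment-only sketch of how this two-agent learn() appears to expect its data,
# based on the actions[:, :2] / actions[:, 2:] slicing above. This reading is an
# assumption, not taken from the original repository:
#
#   joint_state  = concat(state_agent0, state_agent1)    # fed to the critic
#   joint_action = concat(action_agent0, action_agent1)  # 2 + 2 = 4 dims
#   agent 0 replaces joint_action[:, :2] with its own fresh actor prediction,
#   agent 1 replaces joint_action[:, 2:] with its own fresh actor prediction,
#   so each agent's actor is optimized against the shared critic while the other
#   agent's stored actions are held fixed.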
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
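# The last few agents refer to module-level constants (BUFFER_SIZE, BATCH_SIZE,
# GAMMA, TAU, LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, ...) that live in their own
# config sections. Typical values for this style of DDPG implementation are
# sketched below; the exact numbers used by these projects are an assumption.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
TAU = 1e-3               # soft-update interpolation factor
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer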
class Agent:
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size

        # Actor network
        self._actor_local = Actor(state_size, action_size).to(device)
        self._actor_target = Actor(state_size, action_size).to(device)
        self._actor_optimizer = optim.Adam(self._actor_local.parameters())

        # Critic network
        self._critic_local = Critic(state_size, action_size).to(device)
        self._critic_target = Critic(state_size, action_size).to(device)
        self._critic_optimizer = optim.Adam(self._critic_local.parameters())

        # Memory
        self._memory = Memory(BUFFER_SIZE)

        # Do equal weights
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)

    def step(self, state, action, reward, next_state, done):
        self._memory.push((state, action, reward, next_state, done))
        if len(self._memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                samples = self._memory.sample(BATCH_SIZE)
                self.learn(samples)

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)
        if binom.rvs(1, PROBABILITY_RAND_STEP):
            action = np.ndarray((1, ), buffer=np.array(uniform(-1, 1).rvs()))
        else:
            self._actor_local.eval()
            with torch.no_grad():
                action = self._actor_local(state).cpu().data.numpy()
            self._actor_local.train()
        return np.clip(action, -1, 1)

    def hard_update(self, local, target):
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local, target, tau):
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data + (1 - tau) * target_param.data)

    def learn(self, samples):
        states, actions, rewards, next_states, dones = samples

        actions_next = self._actor_target(next_states)
        Q_targets_next = self._critic_target(next_states, actions_next)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))
        Q_expected = self._critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

        actions_pred = self._actor_local(states)
        actor_loss = -self._critic_local(states, actions_pred).mean()
        self._actor_optimizer.zero_grad()
        actor_loss.backward()
        self._actor_optimizer.step()

        self.soft_update(self._critic_local, self._critic_target, TAU)
        self.soft_update(self._actor_local, self._actor_target, TAU)

    def save(self):
        torch.save(self._actor_local.state_dict(), ACTOR_PATH)
        torch.save(self._critic_local.state_dict(), CRITIC_PATH)

    def load(self):
        self._actor_local.load_state_dict(torch.load(ACTOR_PATH))
        self._actor_local.eval()
        self._critic_local.load_state_dict(torch.load(CRITIC_PATH))
        self._critic_local.eval()