# Assumed common context for the snippets below: each originally lived in its
# own module alongside locally defined networks (Actor, Critic), replay
# buffers, and noise processes. The imports here cover what the snippets use.
import copy
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from scipy.stats import binom, uniform

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001

        # Instances of the policy function (actor) and the value function (critic)
        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)

        # Critic local and target networks
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)

        # Actor optimizer
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)
        # Critic optimizer
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Algorithm parameters
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001   # Soft-update factor for the target networks

    # The actor decides what to do based on the policy
    def act_local(self, state):
        # Given a state, return the action recommended by the local policy.
        # Reshape the state to fit the torch tensor input.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Inference only: evaluation mode, no gradient tracking
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        # Set the model back to training mode
        self.actor_local.train()
        return actions.detach()

    def act_target(self, states):
        # Given a batch of states, return the actions recommended by the
        # target policy. Inference only: evaluation mode, no gradient tracking.
        self.actor_target.eval()
        with torch.no_grad():
            actions = self.actor_target(states)
        self.actor_target.train()
        return actions.detach()

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
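# Actor and Critic are referenced throughout these snippets but never
# defined. The sketch below is a minimal assumption of what they might look
# like for the AgentDDPG above (seeded two-layer MLPs, tanh-bounded actions);
# the layer sizes and activations are illustrative, not the original author's.
class Actor(nn.Module):
    def __init__(self, state_size, action_size, seed, hidden=256):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, hidden)
        self.fc2 = nn.Linear(hidden, action_size)

    def forward(self, state):
        # Deterministic policy: maps a state to a bounded action vector
        x = F.relu(self.fc1(state))
        return torch.tanh(self.fc2(x))


class Critic(nn.Module):
    def __init__(self, state_size, action_size, seed, hidden=256):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size + action_size, hidden)
        self.fc2 = nn.Linear(hidden, 1)

    def forward(self, state, action):
        # Q-function: maps a (state, action) pair to a scalar value
        x = F.relu(self.fc1(torch.cat([state, action], dim=-1)))
        return self.fc2(x)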
class AgentDDPG:
    def __init__(self, state_size, action_size, seed):
        """
        :state_size: size of the state vector
        :action_size: size of the action vector
        """
        self.state_size = state_size
        self.action_size = action_size
        self.t_step = 0
        self.score = 0.0
        self.best = 0.0
        self.seed = seed
        self.total_reward = 0.0
        self.count = 0
        self.learning_rate_actor = 0.0001
        self.learning_rate_critic = 0.001
        self.batch_size = 128
        self.update_every = 1

        # Instances of the policy function (actor) and the value function (critic)
        # Actor local and target network definitions
        self.actor_local = Actor(self.state_size, self.action_size, self.seed).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, self.seed).to(device)

        # Critic local and target networks
        self.critic_local = Critic(self.state_size, self.action_size, self.seed).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, self.seed).to(device)

        # Optimizers
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.learning_rate_actor)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.learning_rate_critic)

        # Make sure local and target start with the same weights
        self.actor_target.load_state_dict(self.actor_local.state_dict())
        self.critic_target.load_state_dict(self.critic_local.state_dict())

        # Initialize the Ornstein-Uhlenbeck noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Initialize the replay memory
        self.buffer_size = 1000000
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # Discount factor
        self.tau = 0.001   # Soft-update factor for the target networks

    # The agent interacts with the environment through step()
    def step(self, state, action, reward, next_state, done):
        # Accumulate the reward for this time step and count the steps,
        # so the average episode score can be computed later
        self.total_reward += reward
        self.count += 1

        # Store the experience tuple in the replay buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every update_every time steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Only learn once there are enough samples for a full batch
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                # Train the networks using the experiences
                self.learn(experiences)

    # The actor decides what to do based on the policy
    def act(self, state):
        # Given a state, return the action recommended by the policy.
        # Reshape the state to fit the torch tensor input.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        # Inference only: evaluation mode, no gradient tracking
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(state)
        self.actor_local.train()
        # Because we are exploring, add noise to the action vector
        return list(actions.detach().cpu().numpy().reshape(self.action_size)
                    + self.noise.sample())

    # Learning logic, called when the agent takes a step
    def learn(self, experiences):
        """
        Learning means that the network parameters are updated using the
        batch of experiences. The networks learn from replayed experiences,
        not from direct interaction with the environment.
        """
        # Unpack the experience tuples into separate arrays of states,
        # actions, rewards, next_states, dones (one row per experience)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Convert the batches to torch tensors; they are already shaped
        # (batch_size, feature_size), so no extra batch dimension is needed.
        # rewards and dones stay as numpy arrays for the target computation.
        states = torch.from_numpy(states).float().to(device)
        actions = torch.from_numpy(actions).float().to(device)
        next_states = torch.from_numpy(next_states).float().to(device)

        # First pass the batch of next states to the target actor so it
        # tells us which actions to execute. The target network weights are
        # changed only by soft updates, never by an optimizer, so this is
        # inference.
        self.actor_target.eval()
        with torch.no_grad():
            next_state_actions = self.actor_target(next_states).detach()
        self.actor_target.train()

        # The target critic evaluates the actions chosen by the target actor
        # in the next state and produces Q(s', a') for each sample. These
        # (next_state, action) pairs come from the replay buffer, not from
        # interacting with the environment. These next-state Q-values feed
        # the current-state targets via the Bellman equation.
        self.critic_target.eval()
        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, next_state_actions).detach()
        self.critic_target.train()

        # One-step TD target: terminal states contribute only their reward,
        # non-terminal states add the discounted next-state Q-value. These
        # targets train critic_local in a supervised fashion.
        q_targets = torch.from_numpy(
            rewards + self.gamma * q_targets_next.cpu().numpy() * (1 - dones)
        ).float().to(device)

        # --- Optimize the local critic model ---
        # Supervised training of critic_local: given a batch of (state,
        # action) pairs, it predicts the expected Q-value of each pair.
        q_expected = self.critic_local(states, actions)
        # Clear gradient buffers in preparation
        self.critic_optimizer.zero_grad()
        # Huber (smooth L1) loss between the expected and target Q-values
        critic_loss = F.smooth_l1_loss(q_expected, q_targets)
        critic_loss.backward()
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # --- Optimize the local actor model ---
        # Get the actor's actions for the sampled states
        actor_actions = self.actor_local(states)
        # The actor loss is the negative sum of the Q-values the freshly
        # optimized local critic assigns to those actions: gradient descent
        # on -Q maximizes Q.
        loss_actor = -1 * torch.sum(self.critic_local(states, actor_actions))
        self.actor_optimizer.zero_grad()
        loss_actor.backward()
        self.actor_optimizer.step()

        # Soft-update the target models
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data
                                    + (1.0 - self.tau) * target_param.data)

    def get_episode_score(self):
        """
        Calculate the episode scores
        :return: None
        """
        # Update score and best score
        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best:
            self.best = self.score

    def save_model_weights(self):
        torch.save(self.actor_local.state_dict(), './checkpoints.pkl')
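# OUNoise is used by AgentDDPG above but not defined in these snippets. A
# minimal sketch of the Ornstein-Uhlenbeck process matching the
# OUNoise(size, mu, theta, sigma) call above; the reset-on-construction
# behavior is an assumption.
class OUNoise:
    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Return the internal state to the long-running mean mu
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): mean-reverting noise that
        # produces temporally correlated exploration
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state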
class TD3(object):
    # Twin Delayed DDPG

    def __init__(self, state_dim, action_dim, max_action):
        self.actor = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target = Actor(state_dim, action_dim, max_action).to(device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
        self.critic = Critic(state_dim, action_dim).to(device)
        self.critic_target = Critic(state_dim, action_dim).to(device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
        self.max_action = max_action

    def select_action(self, state):
        state = torch.Tensor(state.reshape(1, -1)).to(device)
        return self.actor(state).cpu().data.numpy().flatten()

    def train(self, replay_buffer, iterations, batch_size=100, discount=0.99,
              tau=0.005, policy_noise=0.2, noise_clip=0.5, policy_freq=2):
        for it in range(iterations):
            # Step 4: sample a batch of transitions (s, s', a, r) from memory
            batch_state, batch_next_state, batch_actions, batch_rewards, \
                batch_dones = replay_buffer.sample(batch_size)
            state = torch.Tensor(batch_state).to(device)
            next_state = torch.Tensor(batch_next_state).to(device)
            action = torch.Tensor(batch_actions).to(device)
            reward = torch.Tensor(batch_rewards).to(device)
            done = torch.Tensor(batch_dones).to(device)

            # Step 5: from the next state s', the actor target plays the next action a'
            next_action = self.actor_target(next_state)

            # Step 6: add clipped Gaussian noise to a' and clamp it to the
            # range of values supported by the environment
            noise = torch.Tensor(batch_actions).data.normal_(0, policy_noise).to(device)
            noise = noise.clamp(-noise_clip, noise_clip)
            next_action = (next_action + noise).clamp(-self.max_action, self.max_action)

            # Step 7: the two critic targets each take (s', a') as input and
            # return two Q-values, Qt1(s', a') and Qt2(s', a')
            target_Q1, target_Q2 = self.critic_target(next_state, next_action)

            # Step 8: keep the minimum of these Q-values, min(Qt1, Qt2)
            target_Q = torch.min(target_Q1, target_Q2)

            # Step 9: the final target of the two critic models is
            # Qt = r + gamma * target_Q; detach so no gradients flow into
            # the target networks
            target_Q = (reward + (1 - done) * discount * target_Q).detach()

            # Step 10: the two critic models each take (s, a) as input and
            # return two Q-values, Q1(s, a) and Q2(s, a)
            current_Q1, current_Q2 = self.critic(state, action)

            # Step 11: critic loss = mse_loss(Q1(s,a), Qt) + mse_loss(Q2(s,a), Qt)
            critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

            # Step 12: backpropagate the critic loss and update both critic
            # models with the Adam optimizer
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Step 13: once every policy_freq iterations, update the actor by
            # gradient ascent on the output of the first critic model
            # (the deterministic policy gradient, DPG)
            if it % policy_freq == 0:
                actor_loss = -self.critic.Q1(state, self.actor(state)).mean()
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # Step 14: still once every policy_freq iterations, update
                # the actor target weights by Polyak averaging
                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(tau * param.data
                                            + (1 - tau) * target_param.data)

                # Step 15: likewise, update the critic target weights by
                # Polyak averaging
                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(tau * param.data
                                            + (1 - tau) * target_param.data)

    def save(self, filename, directory):
        torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
        torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))

    def load(self, filename, directory):
        self.actor.load_state_dict(
            torch.load('%s/%s_actor.pth' % (directory, filename)))
        self.critic.load_state_dict(
            torch.load('%s/%s_critic.pth' % (directory, filename)))
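# ReplayBuffer is referenced above but not defined in these snippets. A
# minimal sketch compatible with the sample() call in the TD3 class above
# (returning states, next_states, actions, rewards, dones as numpy arrays);
# the internal layout and the add() field order are assumptions.
import random
from collections import deque


class ReplayBuffer(object):
    def __init__(self, max_size=1000000):
        self.storage = deque(maxlen=max_size)

    def add(self, state, next_state, action, reward, done):
        # Each transition is stored as a plain tuple
        self.storage.append((state, next_state, action, reward, done))

    def sample(self, batch_size):
        batch = random.sample(self.storage, batch_size)
        states, next_states, actions, rewards, dones = map(np.array, zip(*batch))
        # Rewards and dones as column vectors so they broadcast against Q-values
        return states, next_states, actions, rewards.reshape(-1, 1), dones.reshape(-1, 1)

    def __len__(self):
        return len(self.storage)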
class Agent:
    def __init__(self, device, state_size, action_size, buffer_size=10,
                 batch_size=10, actor_learning_rate=1e-4,
                 critic_learning_rate=1e-3, discount_rate=0.99, tau=0.1,
                 steps_per_update=4, action_range=None, dropout_p=0.0,
                 weight_decay=0.0001, noise_max=0.2, noise_decay=1.0,
                 n_agents=1):
        self.device: torch.device = device
        self.state_size = state_size
        self.action_size = action_size

        self.critic_control = Critic(state_size, action_size).to(device)
        self.critic_control.dropout.p = dropout_p
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_target.eval()
        self.critic_optimizer = torch.optim.Adam(
            self.critic_control.parameters(),
            weight_decay=weight_decay,
            lr=critic_learning_rate)

        self.actor_control = Actor(state_size, action_size, action_range).to(device)
        self.actor_control.dropout.p = dropout_p
        self.actor_target = Actor(state_size, action_size, action_range).to(device)
        self.actor_target.eval()
        self.actor_optimizer = torch.optim.Adam(
            self.actor_control.parameters(),
            weight_decay=weight_decay,
            lr=actor_learning_rate)

        self.batch_size = batch_size
        self.min_buffer_size = batch_size
        self.replay_buffer = ReplayBuffer(device, state_size, action_size, buffer_size)
        self.discount_rate = discount_rate
        self.tau = tau
        self.step_count = 0
        self.steps_per_update = steps_per_update
        self.noise_max = noise_max
        self.noise = OUNoise([n_agents, action_size], 15071988, sigma=self.noise_max)
        self.noise_decay = noise_decay
        self.last_score = float('-inf')

    def policy(self, state, add_noise=True):
        state = torch.from_numpy(state).float().to(self.device)
        self.actor_control.eval()
        with torch.no_grad():
            action = self.actor_control(state).cpu().numpy()
        self.actor_control.train()
        if add_noise:
            noise = self.noise.sample()
            action += noise
        return action

    def step(self, state, action, reward, next_state, done):
        p = self.calculate_p(state, action, reward, next_state, done)
        for i in range(state.shape[0]):
            self.replay_buffer.add(state[i, :], action[i, :], reward[i],
                                   next_state[i, :], done[i], p[i])
        if self.step_count % self.steps_per_update == 0:
            self.learn()
        self.step_count += 1

    def learn(self):
        if len(self.replay_buffer) < self.min_buffer_size:
            return

        indices, (states, actions, rewards, next_states, dones, p) = \
            self.replay_buffer.sample(self.batch_size)

        self.actor_control.eval()
        error = self.bellman_eqn_error(states, actions, rewards, next_states, dones)
        self.actor_control.train()

        # Importance-sampling weights correct the bias introduced by
        # prioritized sampling; normalize by the maximum for stability
        importance_scaling = (self.replay_buffer.buffer_size * p) ** -1
        importance_scaling /= importance_scaling.max()

        self.critic_optimizer.zero_grad()
        loss = (importance_scaling * (error ** 2)).sum() / self.batch_size
        loss.backward()
        self.critic_optimizer.step()

        self.actor_optimizer.zero_grad()
        expected_actions = self.actor_control(states)
        critic_score = self.critic_control(states, expected_actions)
        loss = -1 * (importance_scaling * critic_score).sum() / self.batch_size
        loss.backward()
        self.actor_optimizer.step()

        self.update_target(self.critic_control, self.critic_target)
        self.update_target(self.actor_control, self.actor_target)

        self.replay_buffer.update(indices, error.detach().abs().cpu() + 1e-3)

    def bellman_eqn_error(self, states, actions, rewards, next_states, dones):
        """Double DQN error - use the control network to get the best action
        and apply the target network to it to get the target reward, which is
        used for the Bellman equation error.
        """
        next_actions = self.actor_control(next_states)
        target_action_values = self.critic_target(next_states, next_actions)
        target_rewards = (rewards
                          + self.discount_rate * (1 - dones) * target_action_values)
        current_rewards = self.critic_control(states, actions)
        error = current_rewards - target_rewards
        return error

    def calculate_p(self, state, action, reward, next_state, done):
        next_state = torch.from_numpy(next_state).float().to(self.device)
        state = torch.from_numpy(state).float().to(self.device)
        action = torch.from_numpy(action).float().to(self.device)
        reward = torch.from_numpy(reward).float().to(self.device)
        done = torch.from_numpy(done).float().to(self.device)

        done = done.unsqueeze(1)
        reward = reward.unsqueeze(1)

        # New transitions get a priority based on their current Bellman
        # error; the small constant keeps every priority strictly positive
        self.actor_control.eval()
        self.critic_control.eval()
        with torch.no_grad():
            retval = abs(self.bellman_eqn_error(state, action, reward,
                                                next_state, done)) + 1e-3
        self.critic_control.train()
        self.actor_control.train()
        return retval

    def update_target(self, control, target):
        for target_param, control_param in zip(target.parameters(),
                                               control.parameters()):
            target_param.data.copy_(self.tau * control_param.data
                                    + (1.0 - self.tau) * target_param.data)

    def end_of_episode(self, final_score):
        self.step_count = 0
        self.noise.sigma *= self.noise_decay
        self.last_score = final_score
        self.noise.reset()

    def save(self, path):
        torch.save(self.critic_control.state_dict(), path + '-critic.p')
        torch.save(self.actor_control.state_dict(), path + '-actor.p')

    def restore(self, path):
        self.critic_control.load_state_dict(
            torch.load(path + '-critic.p', map_location='cpu'))
        self.actor_control.load_state_dict(
            torch.load(path + '-actor.p', map_location='cpu'))
class ActorCritic(Model):
    def __init__(self, observation_space_size, action_space_size, name=None,
                 env_name=None, model_config=None, play_mode=False):
        if name is None:
            name = "Unnamed-ActorCritic"
        super(ActorCritic, self).__init__(observation_space_size,
                                          action_space_size, name, env_name,
                                          model_config, play_mode)

    def build_model(self):
        self.policy_net = Actor(self.observation_space_size, self.action_space_size)
        self.critic_net = Critic(self.observation_space_size)
        if self.model_config is None:
            self.gamma = 0.99
            self.actor_optimizer = optim.Adam(self.policy_net.parameters())
            self.actor_loss = nn.MSELoss()
            self.critic_optimizer = optim.Adam(self.critic_net.parameters())
            self.critic_loss = nn.MSELoss()
            self.get_epsilon = self.get_epsilon_default
        else:
            pass

    def save_checkpoint(self, n=0, filepath=None):
        """
        n - number of epoch / episode or whatever is used for enumeration
        """
        # TODO: add other relevant parameters
        checkpoint = {
            'policy': self.policy_net.state_dict(),
            'critic': self.critic_net.state_dict(),
            'actor_optimizer': self.actor_optimizer.state_dict(),
            'critic_optimizer': self.critic_optimizer.state_dict()
        }
        super(ActorCritic, self).save_checkpoint(n, filepath, checkpoint)

    def load_checkpoint(self, filepath):
        # TODO: add other relevant parameters
        checkpoint = torch.load(filepath)
        self.policy_net.load_state_dict(checkpoint['policy'])
        self.critic_net.load_state_dict(checkpoint['critic'])
        self.actor_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.critic_optimizer.load_state_dict(checkpoint['critic_optimizer'])

    def prepare_sample(self, sample):
        # Stack the transition fields so each becomes a dense float tensor
        sample = np.array(sample, dtype=object)
        states = torch.tensor(np.stack(sample[:, 0]), dtype=torch.float32)
        actions = torch.tensor(np.stack(sample[:, 1]), dtype=torch.float32)
        rewards = torch.tensor(sample[:, 2].astype(np.float32))
        next_states = torch.tensor(np.stack(sample[:, 3]), dtype=torch.float32)
        dones = torch.tensor(sample[:, 4].astype(np.int32))
        return states, actions, rewards, next_states, dones

    def critic_update(self, V, V_target):
        self.critic_optimizer.zero_grad()
        critic_loss = self.critic_loss(V, V_target)
        critic_loss.backward()
        self.critic_optimizer.step()
        return critic_loss.item()

    def actor_update(self, advantages, actions, mus):
        self.actor_optimizer.zero_grad()
        actor_loss = self.actor_loss(actions, mus)
        gradient_term = advantages * actor_loss
        gradient_term.backward()
        self.actor_optimizer.step()
        return actor_loss.item()

    def update(self, sample, prepare_state=None):
        actor_running_loss = []
        critic_running_loss = []
        for state, action, reward, next_state, done in sample:
            if prepare_state is not None:
                state = prepare_state(state)
                next_state = prepare_state(next_state)
            state = torch.tensor(state, dtype=torch.float32)
            next_state = torch.tensor(next_state, dtype=torch.float32)
            action = torch.tensor(action, dtype=torch.float32)

            # Update the critic: the one-step TD target is detached so the
            # critic is trained with a semi-gradient update
            V = self.critic_net(state)
            V_target = torch.tensor([reward], dtype=torch.float32)
            if not done:
                V_target = V_target + self.gamma * self.critic_net(next_state).detach()
            critic_loss = self.critic_update(V, V_target)
            critic_running_loss.append(critic_loss)

            # Update the actor, weighting the loss by the advantage
            advantage = (V_target - V).detach()
            mu = self.policy_net(state)
            actor_loss = self.actor_update(advantage, action, mu)
            actor_running_loss.append(actor_loss)
        return actor_running_loss, critic_running_loss

    def batch_update(self, sample, prepare_state=None):
        actor_running_loss = []
        critic_running_loss = []
        states, actions, rewards, next_states, dones = self.prepare_sample(sample)

        # Update the critic on the whole batch; detach the TD target
        V = self.critic_net(states)
        V_target = (rewards
                    + self.gamma * self.critic_net(next_states).detach() * (1 - dones))
        critic_loss = self.critic_update(V, V_target)
        critic_running_loss.append(critic_loss)

        # Update the actor
        advantage = (V_target - V).detach()
        mu = self.policy_net(states)
        actor_loss = self.actor_update(advantage, actions, mu)
        actor_running_loss.append(actor_loss)
        return actor_running_loss, critic_running_loss
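# A sketch of how the ActorCritic update methods above might be driven,
# assuming transitions are collected as (state, action, reward, next_state,
# done) tuples; the environment loop and function name are illustrative, and
# the unknown Model base class may organize this differently.
def train_episode(env, model, prepare_state=None):
    sample = []
    state = env.reset()
    done = False
    while not done:
        action = model.policy_net(
            torch.tensor(state, dtype=torch.float32)).detach().numpy()
        next_state, reward, done, _ = env.step(action)
        sample.append((state, action, reward, next_state, done))
        state = next_state
    # Per-transition updates; batch_update(sample) is the vectorized variant
    actor_losses, critic_losses = model.update(sample, prepare_state)
    return actor_losses, critic_losses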
class Agent(object):
    def __init__(self, n_states, n_actions, lr_actor, lr_critic, tau, gamma,
                 mem_size, actor_l1_size, actor_l2_size, critic_l1_size,
                 critic_l2_size, batch_size):
        self.gamma = gamma
        self.tau = tau
        self.memory = ReplayBuffer(mem_size, n_states, n_actions)
        self.batch_size = batch_size

        self.actor = Actor(lr_actor, n_states, n_actions,
                           actor_l1_size, actor_l2_size)
        self.critic = Critic(lr_critic, n_states, n_actions,
                             critic_l1_size, critic_l2_size)
        self.target_actor = Actor(lr_actor, n_states, n_actions,
                                  actor_l1_size, actor_l2_size)
        self.target_critic = Critic(lr_critic, n_states, n_actions,
                                    critic_l1_size, critic_l2_size)

        self.noise = OUActionNoise(mu=np.zeros(n_actions), sigma=0.005)

        # Copy the online weights into the targets (hard update via tau=1)
        self.update_network_parameters(tau=1)

    def choose_action(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        # Add noise to the action for exploration
        mu_prime = mu + torch.tensor(self.noise(),
                                     dtype=torch.float).to(self.actor.device)
        self.actor.train()
        return mu_prime.cpu().detach().numpy()

    def choose_action_no_train(self, observation):
        self.actor.eval()
        observation = torch.tensor(observation,
                                   dtype=torch.float).to(self.actor.device)
        mu = self.actor.forward(observation).to(self.actor.device)
        return mu.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        self.memory.push(state, action, reward, new_state, done)

    def learn(self):
        if self.memory.idx_last < self.batch_size:
            # Not enough data in the replay buffer yet
            return

        # Sample random transitions
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        reward = torch.tensor(reward, dtype=torch.float).to(self.critic.device)
        done = torch.tensor(done).to(self.critic.device)
        new_state = torch.tensor(new_state, dtype=torch.float).to(self.critic.device)
        action = torch.tensor(action, dtype=torch.float).to(self.critic.device)
        state = torch.tensor(state, dtype=torch.float).to(self.critic.device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()
        target_actions = self.target_actor.forward(new_state)
        critic_value_ = self.target_critic.forward(new_state, target_actions)
        critic_value = self.critic.forward(state, action)

        # done is assumed to hold the non-terminal mask (1 for non-terminal,
        # 0 for terminal), as stored by the replay buffer, so terminal states
        # contribute only their reward
        target = reward + self.gamma * critic_value_.view(-1) * done
        target = target.view(self.batch_size, 1)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(target, critic_value)
        critic_loss.backward()
        self.critic.optimizer.step()
        self.critic.eval()

        self.actor.optimizer.zero_grad()
        self.actor.train()
        mu = self.actor.forward(state)
        actor_loss = -self.critic.forward(state, mu)
        actor_loss = torch.mean(actor_loss)
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau

        actor_params = self.actor.named_parameters()
        critic_params = self.critic.named_parameters()
        target_actor_params = self.target_actor.named_parameters()
        target_critic_params = self.target_critic.named_parameters()

        critic_state_dict = dict(critic_params)
        actor_state_dict = dict(actor_params)
        target_critic_dict = dict(target_critic_params)
        target_actor_dict = dict(target_actor_params)

        for name in critic_state_dict:
            critic_state_dict[name] = tau * critic_state_dict[name].clone() + \
                (1 - tau) * target_critic_dict[name].clone()
        self.target_critic.load_state_dict(critic_state_dict)

        for name in actor_state_dict:
            actor_state_dict[name] = tau * actor_state_dict[name].clone() + \
                (1 - tau) * target_actor_dict[name].clone()
        self.target_actor.load_state_dict(actor_state_dict)

    def save_models(self):
        timestamp = time.strftime("%Y%m%d-%H%M%S")
        self.actor.save("actor_" + timestamp)
        self.target_actor.save("target_actor_" + timestamp)
        self.critic.save("critic_" + timestamp)
        self.target_critic.save("target_critic_" + timestamp)

    def load_models(self, fn_actor, fn_target_actor, fn_critic, fn_target_critic):
        self.actor.load_checkpoint(fn_actor)
        self.target_actor.load_checkpoint(fn_target_actor)
        self.critic.load_checkpoint(fn_critic)
        self.target_critic.load_checkpoint(fn_target_critic)
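# OUActionNoise is the same Ornstein-Uhlenbeck idea as the OUNoise sketch
# earlier, but invoked as a callable with keyword arguments mu and sigma, as
# the Agent above does. A minimal sketch; theta, dt, and the callable
# interface follow common implementations and are assumptions here.
class OUActionNoise:
    def __init__(self, mu, sigma=0.15, theta=0.2, dt=1e-2):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x_prev = np.zeros_like(self.mu)

    def __call__(self):
        # Euler-Maruyama discretization of the OU process
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x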
class TD3(object): """Agent class that handles the training of the networks and provides outputs as actions. Args: state_dim (array): state size action_dim (array): action size policy_noise (float): how much noise to add to actions device (device): cuda or cpu to process the tensors discount (float): discount factor tau (float): soft update for main networks to target networks policy_noise (float): noise factor noise_clip (float): clip factor policy_freq (int): frequency of policy updates """ def __init__(self, state_dim, action_dim, max_action, discount, tau, policy_noise, noise_clip, policy_freq, device): self.state_dim = len(state_dim[0]) self.action_dim = len(action_dim) self.max_action = max_action[2] self.actor = Actor(self.state_dim, self.action_dim, self.max_action).to(device) self.actor_target = copy.deepcopy(self.actor).float() # self.actor_target = Actor(state_dim, action_dim, self.max_action).to(device) # self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4) # or 1e-3 self.critic = Critic(self.state_dim, self.action_dim).to(device) self.critic_target = copy.deepcopy(self.critic).float() # self.critic_target = Critic(state_dim, action_dim).to(device) # self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4) # or 1e-2 self.device = device self.max_action = max_action self.discount = discount self.tau = tau self.policy_noise = policy_noise self.noise_clip = noise_clip self.policy_freq = policy_freq self.total_it = 0 def select_action(self, state): """Select an appropriate action from the agent policy Args: state (array): current state of environment Returns: action (float): action clipped within action range """ state = torch.FloatTensor(state.reshape(1, -1)).to(self.device) # if noise != 0: # action_dim = len(self.env.action_space()) # action = (action + np.random.normal(0, noise, size=action_dim)) # action_space_low, _, action_space_high = self.env.action_domain() # return action.clip(action_space_low, action_space_high) return self.actor(state).cpu().data.numpy().flatten() def train(self, replay_buffer, batch_size=100): """Train and update actor and critic networks Args: replay_buffer (ReplayBuffer): buffer for experience replay batch_size(int): batch size to sample from replay buffer Return: actor_loss (float): loss from actor network critic_loss (float): loss from critic network """ self.total_it += 1 # Sample replay buffer state, next_state, action, reward, done = replay_buffer.sample( batch_size) state = torch.from_numpy( np.asarray([np.array(i.item().values()) for i in state])) next_state = np.asarray( [np.array(i.item().values()) for i in next_state]) reward = torch.as_tensor(reward, dtype=torch.float32) done = torch.as_tensor(done, dtype=torch.float32) with torch.no_grad(): # select an action according to the policy an add clipped noise # need to select set of actions noise = (torch.rand_like(torch.from_numpy(action)) * self.policy_noise).clamp(-self.noise_clip, self.noise_clip) next_action = (self.actor_target( torch.tensor(next_state, dtype=torch.float32)) + torch.tensor(noise, dtype=torch.float32)).clamp( self.max_action[0], self.max_action[2]) # next_action_d =torch.as_tensor(next_action, dtype=torch.double) # Compute the target Q value target_Q1, target_Q2 = self.critic(state, next_action) target_Q = torch.min(target_Q1, target_Q2) target_Q = reward + done * self.discount * target_Q # update action datatype, 
can't do earlier, use np.array earlier action = torch.as_tensor(action, dtype=torch.float32) # get current Q estimates current_Q1, current_Q2 = self.critic(state, action) # compute critic loss critic_loss = F.mse_loss(current_Q1, target_Q[:1, :].transpose( 0, 1)) + F.mse_loss(current_Q2, target_Q[:1, :].transpose(0, 1)) # optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # delayed policy updates if self.total_it % self.policy_freq == 0: # compute the actor loss actor_loss = -self.critic.get_q(state, self.actor(state)).mean() # optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the frozen target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) def save(self, filename, directory): torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename)) torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename)) def load(self, filename="best_avg", directory="./saves"): self.actor.load_state_dict( torch.load('%s/%s_actor.pth' % (directory, filename))) self.critic.load_state_dict( torch.load('%s/%s_critic.pth' % (directory, filename)))
class TD3:
    def __init__(self, device, state_dim, action_dim, action_max, gamma=0.99,
                 tau=0.005, lr=3e-4, policy_noise=0.2, noise_clip=0.5,
                 exploration_noise=0.1, policy_freq=2):
        self.actor = Actor(state_dim, 256, action_dim, action_max).to(device)
        self.target_actor = copy.deepcopy(self.actor)
        self.actor_optimizer = optim.Adam(params=self.actor.parameters(), lr=lr)

        self.critic = Critic(state_dim, 256, action_dim).to(device)
        self.target_critic = copy.deepcopy(self.critic)
        self.critic_optimizer = optim.Adam(params=self.critic.parameters(), lr=lr)

        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq

        self.rollout_actor = TD3RolloutActor(state_dim, action_dim, action_max,
                                             exploration_noise)
        self.sync_rollout_actor()
        self.iteration_num = 0

    def train(self, replay_buffer, batch_size=256):
        self.iteration_num += 1
        st, nx_st, ac, rw, mask = replay_buffer.sample(batch_size)

        with torch.no_grad():
            # Target policy smoothing: clipped noise on the next action
            noise = (torch.randn_like(ac) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            nx_ac = self.target_actor.forward(nx_st, noise)
            # Clipped double-Q: take the minimum of the twin target critics
            target_q1, target_q2 = self.target_critic.forward(nx_st, nx_ac)
            min_q = torch.min(target_q1, target_q2)
            target_q = rw + mask * self.gamma * min_q

        q1, q2 = self.critic.forward(st, ac)
        critic_loss = F.mse_loss(q1, target_q) + F.mse_loss(q2, target_q)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy and target updates
        if self.iteration_num % self.policy_freq == 0:
            actor_loss = -self.critic.q1(st, self.actor.forward(st)).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(self.tau * param.data
                                        + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(self.tau * param.data
                                        + (1 - self.tau) * target_param.data)

            self.sync_rollout_actor()

    def sync_rollout_actor(self):
        # Copy the latest actor weights into the CPU-side rollout actor
        for param, target_param in zip(self.actor.parameters(),
                                       self.rollout_actor.parameters()):
            target_param.data.copy_(param.data.cpu())

    def save(self, path):
        torch.save(self.critic.state_dict(), os.path.join(path, 'critic.pth'))
        torch.save(self.target_critic.state_dict(), os.path.join(path, 'target_critic.pth'))
        torch.save(self.critic_optimizer.state_dict(), os.path.join(path, 'critic_optimizer.pth'))
        torch.save(self.actor.state_dict(), os.path.join(path, 'actor.pth'))
        torch.save(self.target_actor.state_dict(), os.path.join(path, 'target_actor.pth'))
        torch.save(self.actor_optimizer.state_dict(), os.path.join(path, 'actor_optimizer.pth'))

    def load(self, path):
        self.critic.load_state_dict(
            torch.load(os.path.join(path, 'critic.pth')))
        self.target_critic.load_state_dict(
            torch.load(os.path.join(path, 'target_critic.pth')))
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'critic_optimizer.pth')))
        self.actor.load_state_dict(torch.load(os.path.join(path, 'actor.pth')))
        self.target_actor.load_state_dict(
            torch.load(os.path.join(path, 'target_actor.pth')))
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(path, 'actor_optimizer.pth')))
        self.sync_rollout_actor()
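# TD3RolloutActor is referenced above but not defined in these snippets. A
# minimal sketch under the assumption that it mirrors the Actor's layout
# (256-unit hidden layers, tanh output scaled by action_max, so the
# parameter-wise copy in sync_rollout_actor lines up) and adds exploration
# noise on the CPU during rollouts; the architecture and noise handling here
# are assumptions.
class TD3RolloutActor(nn.Module):
    def __init__(self, state_dim, action_dim, action_max, exploration_noise):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(state_dim, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, action_dim), nn.Tanh())
        self.action_max = action_max
        self.exploration_noise = exploration_noise

    @torch.no_grad()
    def forward(self, state):
        # Deterministic action plus Gaussian exploration noise, clamped to bounds
        action = self.net(state) * self.action_max
        action += torch.randn_like(action) * self.exploration_noise
        return action.clamp(-self.action_max, self.action_max)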
class TD3(object): """Agent class that handles the training of the networks and provides outputs as actions. """ def __init__(self): state_dim = cons.STATE_DIM.flatten().shape[0] action_dim = cons.ACTION_DIM self.actor = Actor(state_dim, action_dim, cons.MAX_ACTION).to(cons.DEVICE) self.actor_target = Actor(state_dim, action_dim, cons.MAX_ACTION).to(cons.DEVICE) self.actor_target.load_state_dict(self.actor.state_dict()) self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=3e-4) # or 1e-3 self.critic = Critic(state_dim, action_dim).to(cons.DEVICE) self.critic_target = Critic(state_dim, action_dim).to(cons.DEVICE) self.critic_target.load_state_dict(self.critic.state_dict()) self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=3e-4) # or 1e-3 self.total_it = 0 self.critic_loss_plot = [] self.actor_loss_plot = [] def select_action(self, state, noise=cons.POLICY_NOISE): """Select an appropriate action from the agent policy Args: state (array): current state of environment noise (float): how much noise to add to actions Returns: action (list): nn action results """ state = torch.FloatTensor(state).to(cons.DEVICE) action = self.actor(state) # action space noise introduces noise to change the likelihoods of each action the agent might take if noise != 0: # creates tensor of gaussian noise noise = torch.clamp(torch.randn(14, dtype=torch.float32, device='cuda') * noise, min=-cons.NOISE_CLIP, max=cons.NOISE_CLIP) action = action + noise torch.clamp(action, min=cons.MIN_ACTION, max=cons.MAX_ACTION) return action def train(self, replay_buffer, iterations): """Train and update actor and critic networks Args: replay_buffer (ReplayBuffer): buffer for experience replay iterations (int): how many times to run training Return: actor_loss (float): loss from actor network critic_loss (float): loss from critic network """ for it in range(iterations): self.total_it += 1 # keep track of the total training iterations # Sample replay buffer (priority replay) # choose type of replay if cons.PRIORITY: state, action, reward, next_state, done, weights, indexes = replay_buffer.sample(cons.BATCH_SIZE, beta=cons.BETA_SCHED.value(it)) else: state, action, reward, next_state, done = replay_buffer.sample(cons.BATCH_SIZE) state = torch.from_numpy(state).float().to(cons.DEVICE) # torch.Size([100, 14]) next_state = torch.from_numpy(next_state).float().to(cons.DEVICE) # torch.Size([100, 14]) action = torch.from_numpy(action).float().to(cons.DEVICE) # torch.Size([100, 14]) reward = torch.as_tensor(reward, dtype=torch.float32).to(cons.DEVICE) # torch.Size([100]) done = torch.as_tensor(done, dtype=torch.float32).to(cons.DEVICE) # torch.Size([100]) with torch.no_grad(): # select an action according to the policy and add clipped noise next_action = self.actor_target(next_state) noise = torch.clamp(torch.randn((100, 14), dtype=torch.float32, device='cuda') * cons.POLICY_NOISE, min=-cons.NOISE_CLIP, max=cons.NOISE_CLIP) next_action = torch.clamp((next_action + noise), min=cons.MIN_ACTION, max=cons.MAX_ACTION) # Compute the target Q value target_q1, target_q2 = self.critic(state.float(), next_action.float()) target_q = torch.min(target_q1, target_q2) gamma = torch.ones((100, 1), dtype=torch.float32, device='cuda') gamma = gamma.new_full((100, 1), cons.GAMMA) target_q = reward.unsqueeze(1) + (done.unsqueeze(1) * gamma * target_q).detach() # get current Q estimates current_q1, current_q2 = self.critic(state.float(), action.float()) # compute critic loss critic_loss = F.mse_loss(current_q1, target_q) + 
F.mse_loss(current_q2, target_q) cons.TD3_REPORT.write_critic_loss(self.total_it, it, critic_loss) # optimize the critic self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # using the minimum of the q values as the weight, use min to prevent overestimation if cons.PRIORITY: new_priorities = torch.flatten(torch.min(current_q1, current_q2)) # convert any negative priorities to a minimum value, can't have a negative priority new_priorities = torch.clamp(new_priorities, min=0.0000001).tolist() # convert to a list for storage replay_buffer.update_priorities(indexes, new_priorities) # delayed policy updates if it % cons.POLICY_FREQ == 0: # update the actor policy less frequently # compute the actor loss q_action = self.actor(state).float().detach() actor_loss = -self.critic.get_q(state, q_action).mean() cons.TD3_REPORT.write_actor_loss(self.total_it, it, actor_loss, 1) # optimize the actor self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.actor_loss_plot.append(actor_loss.item()) # Update the frozen right_target models for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): target_param.data.copy_(cons.TAU * param.data + (1 - cons.TAU) * target_param.data) for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): target_param.data.copy_(cons.TAU * param.data + (1 - cons.TAU) * target_param.data) def save(self, filename, directory): torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename)) torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename)) def load(self, filename="best_avg", directory="td3/saves/shared_agent"): self.actor.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename))) self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename)))
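# The TD3 variant above reads its hyperparameters from a cons module that is
# not shown. A sketch of the constants it references; every value below is an
# illustrative assumption except where the code itself implies it (batch size
# 100 and action dimension 14 appear in the tensor shapes).
STATE_DIM = np.zeros((1, 14))   # flattened to state_dim in __init__
ACTION_DIM = 14
MAX_ACTION = 1.0
MIN_ACTION = -1.0
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
GAMMA = 0.99
TAU = 0.005
POLICY_NOISE = 0.2
NOISE_CLIP = 0.5
POLICY_FREQ = 2
BATCH_SIZE = 100
PRIORITY = False                # toggle prioritized replay
# BETA_SCHED and TD3_REPORT are project-specific objects (a beta schedule for
# importance sampling and a logging helper) and are not sketched here.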
class Agent:
    def __init__(self, state_size, action_size):
        self._state_size = state_size
        self._action_size = action_size

        # Actor network
        self._actor_local = Actor(state_size, action_size).to(device)
        self._actor_target = Actor(state_size, action_size).to(device)
        self._actor_optimizer = optim.Adam(self._actor_local.parameters())

        # Critic network
        self._critic_local = Critic(state_size, action_size).to(device)
        self._critic_target = Critic(state_size, action_size).to(device)
        self._critic_optimizer = optim.Adam(self._critic_local.parameters())

        # Memory
        self._memory = Memory(BUFFER_SIZE)

        # Start local and target networks with equal weights
        self.hard_update(self._actor_local, self._actor_target)
        self.hard_update(self._critic_local, self._critic_target)

    def step(self, state, action, reward, next_state, done):
        self._memory.push((state, action, reward, next_state, done))
        if len(self._memory) > BATCH_SIZE:
            for _ in range(UPDATES_PER_STEP):
                samples = self._memory.sample(BATCH_SIZE)
                self.learn(samples)

    def act(self, state):
        state = torch.from_numpy(state).float().to(device)
        # Epsilon-style exploration: with probability PROBABILITY_RAND_STEP
        # take a uniformly random action instead of the policy action
        if binom.rvs(1, PROBABILITY_RAND_STEP):
            # scipy's uniform takes (loc, scale), so uniform(-1, 2) samples
            # from the interval [-1, 1]
            action = np.array([uniform(-1, 2).rvs()])
        else:
            self._actor_local.eval()
            with torch.no_grad():
                action = self._actor_local(state).cpu().data.numpy()
            self._actor_local.train()
        return np.clip(action, -1, 1)

    def hard_update(self, local, target):
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(local_param.data)

    def soft_update(self, local, target, tau):
        for target_param, local_param in zip(target.parameters(), local.parameters()):
            target_param.data.copy_(tau * local_param.data
                                    + (1 - tau) * target_param.data)

    def learn(self, samples):
        states, actions, rewards, next_states, dones = samples

        # One-step TD target from the target networks
        actions_next = self._actor_target(next_states)
        Q_targets_next = self._critic_target(next_states, actions_next)
        Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones))

        # Critic update: minimize the TD error
        Q_expected = self._critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self._critic_optimizer.zero_grad()
        critic_loss.backward()
        self._critic_optimizer.step()

        # Actor update: maximize the critic's value of the actor's actions
        actions_pred = self._actor_local(states)
        actor_loss = -self._critic_local(states, actions_pred).mean()
        self._actor_optimizer.zero_grad()
        actor_loss.backward()
        self._actor_optimizer.step()

        self.soft_update(self._critic_local, self._critic_target, TAU)
        self.soft_update(self._actor_local, self._actor_target, TAU)

    def save(self):
        torch.save(self._actor_local.state_dict(), ACTOR_PATH)
        torch.save(self._critic_local.state_dict(), CRITIC_PATH)

    def load(self):
        self._actor_local.load_state_dict(torch.load(ACTOR_PATH))
        self._actor_local.eval()
        self._critic_local.load_state_dict(torch.load(CRITIC_PATH))
        self._critic_local.eval()
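# The final Agent reads module-level constants that are not shown. A sketch
# with commonly used values; all numbers and paths here are assumptions, and
# Memory (the replay buffer class used above) is not sketched.
BUFFER_SIZE = int(1e6)
BATCH_SIZE = 128
UPDATES_PER_STEP = 1
PROBABILITY_RAND_STEP = 0.1   # chance of taking a uniform random action
GAMMA = 0.99
TAU = 1e-3
ACTOR_PATH = './actor.pth'
CRITIC_PATH = './critic.pth'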