class DDPGAGent(Agent):
    def __init__(self, actor_network, critic_network, act_def: ContinuousDefinition, opts=DDPGAgentOptions()):
        '''
        :param actor_network (nn.Module): Actor network
        :param critic_network (nn.Module): Critic network
        :param act_def (ContinuousDefinition): Action space definition
        :param opts: Agent options
        '''
        super().__init__()
        self.opts = opts
        self.actor_network = actor_network
        self.critic_network = critic_network
        self.act_def = act_def
        self.target_actor_network = copy.deepcopy(actor_network)
        self.target_critic_network = copy.deepcopy(critic_network)

        # Freeze target networks. They will never be updated with gradient descent/ascent
        for p in self.target_critic_network.parameters():
            p.requires_grad = False
        for p in self.target_actor_network.parameters():
            p.requires_grad = False

        # Init target networks with the same parameters as the source networks
        polyak_update(self.target_actor_network, self.actor_network, 1)
        polyak_update(self.target_critic_network, self.critic_network, 1)

        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size)

        # Initialize optimizers
        self.actor_optimizer = self.opts.actor_optimizer(
            self.actor_network.parameters(), self.opts.actor_learning_rate)
        self.critic_optimizer = self.opts.critic_optimizer(
            self.critic_network.parameters(), self.opts.critic_learning_rate)

        self.random_process = OrnsteinUhlenbeckProcess(size=2, theta=0.15, mu=0.0, sigma=0.1)

    def act(self, state, add_noise=False, uniform_noise=False):
        '''
        Generates an action using the actor network. During training, noise is added to the
        action to ensure exploration; this added noise is what makes DDPG off-policy.
        :param state: Current state
        :param add_noise: Add Gaussian noise to the actor output
        :param uniform_noise: Sample a uniformly random action instead of querying the actor
        :return: Action tensor
        '''
        if uniform_noise:
            action = np.random.uniform(self.act_def.lower_limit, self.act_def.upper_limit, size=self.act_def.shape)
            action = torch.tensor(action).float().to(self.actor_network.device)
        else:
            if type(state) is not torch.Tensor:
                state = torch.tensor(state).to(self.actor_network.device).float()
            bias = (self.act_def.upper_limit + self.act_def.lower_limit) / 2
            bias = torch.tensor(bias).to(self.actor_network.device).float()
            bias.requires_grad = False
            scale = (self.act_def.upper_limit - self.act_def.lower_limit) / 2
            #action = self.actor_network.forward(state)*scale + bias
            action = self.actor_network.forward(state)
            if add_noise:
                if self.opts.noise_epsilon > 0:
                    #action = torch.add(action, torch.tensor(self.random_process.sample()).float().to(self.actor_network.device))
                    noise = torch.randn(action.shape, dtype=torch.float32) * math.sqrt(self.opts.noise_var)
                    action = action + noise.to(self.actor_network.device).float()
        return action

    def learn(self, environment: Environment, n_episodes: int, n_iterations: int):
        avg_rewards = []
        for i in range(n_episodes):
            n_update_iter = 0  # Number of update iterations done. Needed to check if the target networks need an update
            curr_state = torch.tensor(environment.reset()).to(device=self.actor_network.device).float()
            episode_rewards = []
            while True:
                uniform_noise = False
                if n_update_iter < self.opts.uniform_noise_steps:
                    # Select a random action for early exploration
                    uniform_noise = True
                action = self.act(curr_state, add_noise=True, uniform_noise=uniform_noise).cpu().detach().numpy()
                next_state, reward, done, _ = environment.step(action)
                episode_rewards.append(reward)
                self.exp_buffer.add_experience(curr_state,
                                               torch.tensor(action).float(),
                                               torch.tensor(reward).float(),
                                               torch.tensor(next_state).float(),
                                               torch.tensor(done))
                curr_state = torch.tensor(next_state).float().to(self.actor_network.device)
                curr_state.requires_grad = False
                self.opts.noise_epsilon = self.opts.noise_epsilon - self.opts.noise_depsilon

                if done:
                    self.reset()
                    total_episode_reward = np.array(episode_rewards).sum()
                    avg_rewards.append(total_episode_reward)
                    print("({}/{}) - End of episode with total reward: {} iteration: {}".format(
                        i, n_episodes, total_episode_reward, n_update_iter))
                    break

                if self.exp_buffer.is_accumulated():
                    # Do the updates: sample experiences from the buffer
                    #self.critic_network.eval()
                    s_states, s_actions, s_rewards, s_next_states, s_done = \
                        self.exp_buffer.sample_tensor(self.opts.exp_batch_size,
                                                      device=self.actor_network.device,
                                                      dtype=torch.float32)
                    critic = self.critic_network.forward(s_states, s_actions.detach())
                    target_actions = self.target_actor_network.forward(s_next_states)
                    target_critics = self.target_critic_network.forward(s_next_states, target_actions)
                    target = s_rewards.view(-1, 1) + self.opts.discount * (1 - s_done.view(-1, 1)) * target_critics

                    # Run gradient descent on the critic network
                    self.critic_optimizer.zero_grad()
                    #self.critic_network.train()  # Enable train mode
                    critic_loss = torch.nn.functional.mse_loss(critic, target)
                    critic_loss.backward()
                    self.critic_optimizer.step()

                    # Run gradient ascent on the actor network
                    self.actor_optimizer.zero_grad()
                    self.actor_network.train()  # Enable train mode
                    actor_out = self.act(s_states)
                    actor_loss = -self.critic_network(s_states.detach(), actor_out)
                    actor_loss = actor_loss.mean()
                    actor_loss.backward()
                    self.actor_optimizer.step()
                    #print(self.actor_network.fc3.weight.grad.mean())

                    self.update_target_networks(0.01)

                n_update_iter += 1  # One iteration is complete
        return avg_rewards

    def reset(self):
        self.random_process.reset_states()

    def save_model(self, PATH):
        torch.save(self.actor_network.state_dict(), PATH + "_actor")
        torch.save(self.critic_network.state_dict(), PATH + "_critic")

    def load_model(self, PATH):
        self.actor_network.load_state_dict(torch.load(PATH + "_actor"))
        self.critic_network.load_state_dict(torch.load(PATH + "_critic"))

    def update_target_networks(self, p):
        polyak_update(self.target_actor_network, self.actor_network, p)
        polyak_update(self.target_critic_network, self.critic_network, p)
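# The polyak_update helper used by the agents is imported from elsewhere in the repo and is
# not shown in this section. Below is a minimal sketch of what it is assumed to do, inferred
# from how it is called here (polyak_update(target, source, 1) to copy the source weights,
# update_target_networks(0.01) for soft updates): target <- p * source + (1 - p) * target.
# This is an illustrative assumption, not the repo's actual implementation.
import torch


def polyak_update_sketch(target_network, source_network, p):
    with torch.no_grad():
        for target_param, source_param in zip(target_network.parameters(), source_network.parameters()):
            target_param.data.copy_(p * source_param.data + (1.0 - p) * target_param.data)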
from Agents.ExperienceBuffer import ExperienceBuffer

import torch
import math
import numpy as np

# Quick smoke test for the experience buffer
exp = ExperienceBuffer(5)
exp.add_experience(0, 1, 4, 4, 0)
exp.add_experience(0, 2, 4, 5, 0)
exp.add_experience(0, 3, 4, 6, 0)
exp.add_experience(0, 4, 4, 7, 1)

states, actions, rewards, next_states, done = exp.sample(2)
for i in range(2):
    print("Sampled exp: {}, {}, {}, {}, {}".format(states[i], actions[i], rewards[i], next_states[i], done[i]))

# Noise sanity checks
t = torch.Tensor([1, 2])
noise = torch.randn(t.shape, dtype=torch.float32)

uni = np.random.uniform(-9.8, 9.8, size=(10, 1))
print(uni)
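# A minimal sketch of the replay-buffer behaviour the test above exercises. This is NOT the
# repo's Agents.ExperienceBuffer (which also provides sample_tensor, sample_numpy,
# is_accumulated and clustering); it only illustrates the add_experience/sample pattern
# with a bounded deque.
import random
from collections import deque


class MinimalReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)  # Oldest experiences are dropped automatically

    def add_experience(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        # Transpose the list of tuples into lists of states, actions, rewards, next_states, dones
        return tuple(map(list, zip(*batch)))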
class SACAgent(Agent):
    '''
    Implements a Soft Actor-Critic agent
    '''
    def __init__(self, multihead_net: MultiheadNetwork, act_def: ContinuousDefinition, opts: SACAgentOptions = SACAgentOptions()):
        super().__init__()
        self.opts = opts
        self.act_def = act_def
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size, act_def)
        self.multihead_net = multihead_net
        self.multihead_net.init_network(self.opts.learning_rate)
        self.init_entropy()
        if not hasattr(self.multihead_net, "sample"):
            raise NotImplementedError("Network must implement 'sample' method")

    def init_entropy(self):
        if self.opts.auto_entropy:
            device = "cpu"
            if self.opts.use_gpu and torch.cuda.is_available():
                device = "cuda:0"
            self.entropy = torch.zeros(1, dtype=torch.float32, requires_grad=True, device=device)
            self.entropy_optimizer = optim.Adam([self.entropy], self.opts.auto_entrop_lr)
        else:
            self.entropy = self.opts.entropy_scale

    def act_cluster(self, new_state, n_episode):
        # Classify new_state
        index = self.exp_buffer.classify(new_state, self.opts.update_cluster_scale)
        # Get the cluster it belongs to
        cluster = self.exp_buffer.clusters[index]
        # Calculate the next action
        if n_episode < self.opts.n_episodes_exploring_least_acts:
            # Explore by using the least used actions
            action = cluster.generate_action(self.act_def)
        else:
            # Use actions with better rewards
            action = cluster.generate_action_reward(self.act_def)
        self.exp_buffer.last_cluster_id = index  # Just in case
        return action

    def act(self, new_state, evaluation=False):
        new_state = self.multihead_net.feature_extraction(new_state)
        # TODO: For now evaluation and non-evaluation behave the same, apart from the added noise.
        if not evaluation:
            #new_state = torch.from_numpy(new_state).to(device).float().unsqueeze(0)
            with torch.no_grad():
                action, _, _ = self.multihead_net.sample(new_state, add_noise=True)
            return action.squeeze(0).detach().cpu().numpy()
        else:
            #new_state = torch.from_numpy(new_state).to(device).float().unsqueeze(0)
            with torch.no_grad():
                action, _, _ = self.multihead_net.sample(new_state, add_noise=False)
            return action.squeeze(0).detach().cpu().numpy()

    def update_params(self, n_iter, device):
        # Learn if enough data is accumulated
        if self.exp_buffer.is_accumulated(self.opts.exp_batch_size):
            if self.opts.clustering and len(self.exp_buffer.clusters) == 0:
                return
            if self.multihead_net.base_optimizer is not None:
                self.multihead_net.base_optimizer.zero_grad()

            # Sample from the buffer
            s_states, s_actions, s_rewards, s_next_states, s_done = \
                self.exp_buffer.sample_tensor(self.opts.exp_batch_size, device, torch.float)
            features = self.multihead_net(s_states)

            # Target values
            with torch.no_grad():
                target_features = self.multihead_net(s_next_states)
                next_actions, log_probs, _ = self.multihead_net.sample(target_features, add_noise=True)
                critic_1_target, critic_2_target = self.multihead_net.get_target_critics(target_features, next_actions)
                critic_target = torch.min(critic_1_target, critic_2_target)
                target_value = critic_target - self.opts.entropy_scale*log_probs
                target_value = target_value*(1 - s_done.view(-1, 1))
                q_hat = s_rewards.view(-1, 1) + self.opts.discount*target_value

            # Optimize the critics
            self.multihead_net.critic_optimizer.zero_grad()
            critic_1, critic_2 = self.multihead_net.get_critics(features, s_actions)
            critic_loss_1 = F.mse_loss(critic_1, q_hat.detach())
            critic_loss_2 = F.mse_loss(critic_2, q_hat.detach())
            critic_loss = critic_loss_1 + critic_loss_2
            critic_loss.backward(retain_graph=True)
            self.multihead_net.critic_optimizer.step()

            # Optimize the policy
            self.multihead_net.policy_optimizer.zero_grad()
            # Calculate critic values for the policy loss using actions sampled from the current policy
            actions, log_probs, _ = self.multihead_net.sample(features, add_noise=True)
            critic_1_curr, critic_2_curr = self.multihead_net.get_critics(features, actions)
            critic_curr = torch.min(critic_1_curr, critic_2_curr)
            actor_loss = (self.opts.entropy_scale*log_probs - critic_curr).mean()
            actor_loss.backward()
            self.multihead_net.policy_optimizer.step()

            if self.multihead_net.base_net is not None:
                self.multihead_net.base_optimizer.step()

            if n_iter % 2500 == 0:
                print("Critic Loss 1: {} - Critic Loss 2: {} - Actor Loss: {}".format(
                    critic_loss_1.item(), critic_loss_2.item(), actor_loss.item()))
            if n_iter % 1 == 0:
                self.multihead_net.update_targets(self.opts.tau)

    def load_checkpoint(self, checkpoint):
        all_rewards = []
        avg_rewards = []
        if checkpoint is not None:
            filepath = checkpoint
            reward_path = os.path.dirname(filepath)
            reward_path = os.path.join(reward_path, "rewards")
            self.load_model(filepath)
            with open(reward_path, 'r') as reward_file:
                reward_dict = json.load(reward_file)
                all_rewards = reward_dict['Rewards']
                avg_rewards = reward_dict['Averages']
        return all_rewards, avg_rewards

    def learn(self, env: Environment, trnOpts: TrainOpts):
        device = "cpu"
        if self.opts.use_gpu and torch.cuda.is_available():
            device = "cuda:0"

        # Load checkpoint
        all_rewards, avg_rewards = self.load_checkpoint(trnOpts.checkpoint)

        self.multihead_net.to(device)
        n_iter = 0
        e = 0
        max_episodes = trnOpts.n_episodes
        max_steps = trnOpts.n_iterations
        while e < max_episodes:  # Loop over episodes
            if max_steps > 0 and n_iter > max_steps:
                break
            curr_state = env.reset()
            if type(curr_state) is not torch.Tensor:
                curr_state = torch.from_numpy(curr_state).to(device).float()
            curr_state = curr_state.unsqueeze(0)
            episode_rewards = []
            step = 0
            while True:
                n_iter += 1
                step += 1
                # Collect experience
                # e < self.opts.n_episodes_exploring => this condition could be added too
                # A cluster count greater than 0 means that clustering has already been done
                clustering = self.opts.clustering and e < self.opts.n_episodes_exploring and len(self.exp_buffer.clusters) > 0
                with torch.no_grad():
                    if clustering:
                        action = self.act_cluster(curr_state, e)
                    else:
                        action = self.act(curr_state)
                next_state, reward, done, _ = env.step(action)
                if self.opts.render:
                    env.render()
                episode_rewards.append(reward)
                if clustering:
                    self.exp_buffer.clusters[self.exp_buffer.last_cluster_id].add_action(action, reward)

                # Check clustering: no clusters yet means clustering has not been done
                if self.opts.clustering and len(self.exp_buffer) > self.opts.cluster_samples \
                        and len(self.exp_buffer.clusters) == 0:
                    print("Clustering")
                    self.exp_buffer.cluster(self.opts.n_clusters, self.opts.use_elbow_plot)

                if type(next_state) is not torch.Tensor:
                    next_state = torch.from_numpy(next_state).to(device).float()
                next_state = next_state.unsqueeze(0)
                self.exp_buffer.add_experience(curr_state.detach().cpu().squeeze(0), action, reward,
                                               next_state.detach().cpu().squeeze(0), done)

                if done:
                    if not self.exp_buffer.is_accumulated(self.opts.exp_batch_size) or \
                            (self.opts.clustering and len(self.exp_buffer.states) < self.opts.cluster_samples):
                        print("Accumulating buffer iteration: {}".format(n_iter))
                    else:
                        episode_end_reward = np.array(episode_rewards).sum()
                        all_rewards.append(episode_end_reward)
                        e += 1  # Move on to the next episode
                        avg_reward = np.mean(all_rewards[-100:])
                        avg_rewards.append(avg_reward)
                        print("({}/{}) - End of episode with total reward: {} - Avg Reward: {} Total Iter: {}".format(
                            e, max_episodes, episode_end_reward, avg_reward, step))
                    break

                curr_state = next_state

                # Learn if enough data is accumulated
                if self.exp_buffer.is_accumulated(self.opts.exp_batch_size):
                    #self.update_params(n_iter, device)
                    start = time.time()
                    self.update_params(n_iter, device)
                    end = time.time()
                    print("Elapsed :{}".format(end - start))

                if n_iter > 0 and self.opts.save_frequency > 0 and n_iter % self.opts.save_frequency == 0:
                    print("Saving at iteration {}".format(n_iter))
                    path = os.path.join(trnOpts.save_path, time.strftime("%Y%m%d-%H%M%S"))
                    self.save_model(path)
                    self.save_rewards(path, all_rewards, avg_rewards)

        return all_rewards, avg_rewards

    def save_model(self, PATH):
        # Check the path
        try:
            if not os.path.exists(PATH):
                os.mkdir(PATH)
            torch.save(self.multihead_net.state_dict(), os.path.join(PATH, "multihead"))
        except Exception:
            print("Couldn't save model")

    def save_rewards(self, PATH, all_rewards, avg_rewards):
        try:
            if not os.path.exists(PATH):
                os.mkdir(PATH)
            torch.save(self.multihead_net.state_dict(), os.path.join(PATH, "multihead"))
            info = {"Rewards": all_rewards, "Averages": avg_rewards}
            with open(os.path.join(PATH, "rewards"), 'w') as fp:
                json.dump(info, fp)
        except Exception:
            print("Couldn't save rewards")

    def load_model(self, PATH):
        state_dict = torch.load(PATH)
        self.multihead_net.load_state_dict(state_dict)

    def reset(self):
        pass
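# A minimal usage sketch (not part of the repo). It assumes a MultiheadNetwork subclass and
# a continuous-control Environment wrapper already exist; MyMultiheadNet, my_act_def and
# my_env below are placeholders, and only the TrainOpts attributes read by SACAgent.learn
# above (n_episodes, n_iterations, checkpoint, save_path) are relied upon.
if __name__ == "__main__":
    opts = SACAgentOptions()
    opts.use_gpu = True
    opts.clustering = False          # Plain SAC, no experience clustering

    train_opts = TrainOpts()
    train_opts.n_episodes = 200
    train_opts.n_iterations = -1     # No hard step limit
    train_opts.checkpoint = None     # Start from scratch
    train_opts.save_path = "checkpoints"

    agent = SACAgent(MyMultiheadNet(), my_act_def, opts)
    all_rewards, avg_rewards = agent.learn(my_env, train_opts)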
class DQAgent(Agent):
    def __init__(self, network, act_def: DiscreteDefinition, opts=DQAgentOpts()):
        super().__init__()
        self.network = network
        self.target_network = copy.deepcopy(network)
        polyak_update(self.target_network, self.network, 1)

        # Freeze the target network
        for p in self.target_network.parameters():
            p.requires_grad = False

        self.opts = opts
        self.act_def = act_def
        self.exp_buffer = ExperienceBuffer(self.opts.exp_buffer_size)
        self.exp_stack = StackedState(self.opts.exp_stack_size)
        self.epsilon = 1.0

    def act(self, new_state, device="cpu"):
        with torch.no_grad():
            if np.random.random() < self.epsilon:
                action = random.randint(0, len(self.act_def.samples) - 1)
            else:
                action = np.argmax(self.network(torch.tensor(new_state).float().to(device).unsqueeze(0)).detach().cpu().numpy())
        return action

    def act_greedy(self, new_state):
        with torch.no_grad():
            net_out = self.network(new_state.unsqueeze(0))
            action = np.argmax(net_out.detach().cpu().numpy())
        return action

    def learn(self, env: Environment, max_episodes: int, max_steps: int):
        device = "cpu"
        if self.opts.use_gpu and torch.cuda.is_available():
            device = "cuda:0"
        self.network.to(device)
        self.target_network.to(device)
        self.reset()

        total_steps = 0
        optimizer = self.opts.optimizer(self.network.parameters(), self.opts.learning_rate)
        avg_rewards = []
        losses = []
        learning_complete = False
        episodes_passed = 0
        while not learning_complete:
            current_step = 0
            target_update_iter = 0
            episode_rewards = []
            curr_state = env.reset()
            action = 0
            if self.opts.use_exp_stack:
                curr_state = self.exp_stack.add_and_get(curr_state)
            #curr_state = torch.tensor(curr_state).to(device).float()
            if episodes_passed > max_episodes:
                learning_complete = True
                break
            while True:
                done = 0
                with torch.no_grad():
                    # Just collecting experience
                    for i in range(self.opts.exp_stack_size - 1):
                        action = self.act(curr_state, device)
                        next_state, reward, done, _ = env.step(self.act_def[action])
                        self.exp_stack.add_state(next_state)
                        total_steps += 1  # Doesn't reset between episodes
                    next_state = self.exp_stack.get_stacked_states()
                    episode_rewards.append(reward)
                    self.exp_buffer.add_experience(curr_state, action, reward, next_state, done)
                    curr_state = next_state
                if self.opts.render:
                    env.render()
                if done or current_step > max_steps:
                    self.reset()
                    total_episode_reward = np.array(episode_rewards).sum()
                    avg_rewards.append(total_episode_reward)
                    print("({}/{}) - End of episode with total reward: {} iteration: {} Memory Size: {}".format(
                        episodes_passed, max_episodes, total_episode_reward, current_step, len(self.exp_buffer)))
                    break
                if self.exp_buffer.is_accumulated():
                    s_states, s_actions, s_rewards, s_next_states, s_done = \
                        self.exp_buffer.sample_numpy(self.opts.exp_batch_size)
                    # TODO: n-step Q-learning
                    optimizer.zero_grad()
                    with torch.no_grad():
                        s_next_states = torch.from_numpy(s_next_states).to(device).float()
                        s_done = torch.from_numpy(s_done).to(device).float()
                        s_rewards = torch.from_numpy(s_rewards).to(device).float()
                        # Terminal states have V(s) = 0; that is why we multiply by (1 - s_done)
                        next_state_vals = self.target_network(s_next_states) * (1 - s_done.view(-1, 1))
                        next_state_vals = next_state_vals * self.opts.discount  # Discount the reward
                        # In the TD target, use the target network (see Double Q-learning)
                        td_target = s_rewards + next_state_vals.max(1)[0].detach()
                    #loss = -self.opts.loss(td_target, self.network(s_states))
                    s_states = torch.from_numpy(s_states).to(device).float()
                    s_actions = torch.from_numpy(s_actions).to(device).to(torch.int64)
                    curr_state_estimations = self.network(s_states).gather(1, s_actions.view(-1, 1))
                    loss = torch.nn.functional.mse_loss(curr_state_estimations, td_target.unsqueeze(1))
                    loss.backward()
                    optimizer.step()
                    target_update_iter += 1
                    losses.append(loss.item())

                    # Update the target network
                    if target_update_iter > self.opts.target_update_freq:
                        target_update_iter = 0
                        polyak_update(self.target_network, self.network, 1)
                        print("Update target at step {}".format(total_steps))

                if self.opts.verbose and total_steps % self.opts.verbose_frequency == 0 and len(losses) > 0:
                    print("Total Steps:{} - Loss:{} - Curr Epsilon:{}".format(total_steps, losses[-1], self.epsilon))

                current_step += 1  # Resets every episode

            if self.exp_buffer.is_accumulated():
                episodes_passed += 1  # Increment the episode count only if enough experience is collected
                self.epsilon = self.opts.min_epsilon + \
                    (self.opts.max_epsilon - self.opts.min_epsilon) * np.exp(-1.0 * episodes_passed / self.opts.epsilon_decay)
        return avg_rewards, losses

    def save_model(self, PATH):
        torch.save(self.network.state_dict(), PATH)

    def load_model(self, PATH):
        self.network.load_state_dict(torch.load(PATH))

    def reset(self):
        self.exp_stack.reset()
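# A minimal usage sketch (not part of the repo). It assumes a small Q-network that maps
# states to one Q-value per action, a DiscreteDefinition listing the available actions in
# act_def.samples, and an Environment wrapper around a discrete-action task. QNet,
# my_act_def and my_env below are placeholders.
if __name__ == "__main__":
    opts = DQAgentOpts()
    opts.use_gpu = False

    agent = DQAgent(QNet(), my_act_def, opts)
    avg_rewards, losses = agent.learn(my_env, max_episodes=300, max_steps=500)
    agent.save_model("dqn_checkpoint.pt")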