import os
import itertools
from collections import namedtuple
from typing import Tuple

import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.multiprocessing as mp
from torch.distributions import Normal, Categorical
from torch.utils.tensorboard import SummaryWriter

# Network, buffer, and utility classes referenced below (ValueNetwork, PolicyNetwork,
# SoftQNetwork, QNetwork, VNetwork, RNetwork, Classifier, FeatureExtractor, BasicBuffer,
# ReplayBuffer, create_run_folder, load_training_parameters) are assumed to come from
# the project's own modules.


class DecoupledA3CAgent:

    def __init__(self, env, gamma, lr, global_max_episode):
        self.env = env
        self.gamma = gamma
        self.lr = lr

        self.global_episode = mp.Value('i', 0)
        self.GLOBAL_MAX_EPISODE = global_max_episode

        # Global (shared) networks live in shared memory so every worker process
        # reads and writes the same parameters.
        self.global_value_network = ValueNetwork(self.env.observation_space.shape[0], 1)
        self.global_value_network.share_memory()
        self.global_policy_network = PolicyNetwork(
            self.env.observation_space.shape[0], self.env.action_space.n)
        self.global_policy_network.share_memory()

        self.global_value_optimizer = optim.Adam(self.global_value_network.parameters(), lr=lr)
        self.global_policy_optimizer = optim.Adam(self.global_policy_network.parameters(), lr=lr)

        # one worker per CPU core
        self.workers = [
            DecoupledWorker(i, env, self.gamma,
                            self.global_value_network, self.global_policy_network,
                            self.global_value_optimizer, self.global_policy_optimizer,
                            self.global_episode, self.GLOBAL_MAX_EPISODE)
            for i in range(mp.cpu_count())
        ]

    def train(self):
        print("Training on {} cores".format(mp.cpu_count()))
        input("Enter to start")

        for worker in self.workers:
            worker.start()
        for worker in self.workers:
            worker.join()

    def save_model(self):
        torch.save(self.global_value_network.state_dict(), "a3c_value_model.pth")
        torch.save(self.global_policy_network.state_dict(), "a3c_policy_model.pth")
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value net
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
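# Usage sketch for the SAC agent above (an addition for illustration, not part of the
# original class): it assumes a continuous-control Gym environment ("Pendulum-v1" is
# only a placeholder) and that BasicBuffer exposes push(state, action, reward,
# next_state, done) and __len__.
def run_sac_example(num_steps=10_000, batch_size=128):
    import gym
    env = gym.make("Pendulum-v1")
    agent = SACAgent(env, gamma=0.99, tau=0.005,
                     v_lr=3e-4, q_lr=3e-4, policy_lr=3e-4, buffer_maxlen=1_000_000)

    state = env.reset()
    for _ in range(num_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed buffer API
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)
        state = env.reset() if done else next_state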
class DecoupledWorker(mp.Process):

    def __init__(self, id, env, gamma, global_value_network, global_policy_network,
                 global_value_optimizer, global_policy_optimizer, global_episode,
                 GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.local_policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted Monte-Carlo return from each step
        # sorry, not the most readable code.
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = rewards.view(-1, 1) + \
            torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.local_value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.local_policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # entropy bonus: -sum_a pi(a|s) log pi(a|s) for each state
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) \
            * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update_global(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.global_value_optimizer.zero_grad()
        value_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(self.local_value_network.parameters(),
                                               self.global_value_network.parameters()):
            global_params._grad = local_params._grad
        self.global_value_optimizer.step()

        self.global_policy_optimizer.zero_grad()
        policy_loss.backward()
        # propagate local gradients to global parameters
        for local_params, global_params in zip(self.local_policy_network.parameters(),
                                               self.global_policy_network.parameters()):
            global_params._grad = local_params._grad
        self.global_policy_optimizer.step()

    def sync_with_global(self):
        self.local_value_network.load_state_dict(self.global_value_network.state_dict())
        self.local_policy_network.load_state_dict(self.global_policy_network.state_dict())

    def run(self):
        state = self.env.reset()
        trajectory = []  # [[s, a, r, s', done], ...]
        episode_reward = 0

        while self.global_episode.value < self.GLOBAL_MAX_EPISODE:
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                with self.global_episode.get_lock():
                    self.global_episode.value += 1
                print(self.name + " | episode: " + str(self.global_episode.value)
                      + " " + str(episode_reward))

                self.update_global(trajectory)
                self.sync_with_global()

                trajectory = []
                episode_reward = 0
                state = self.env.reset()
            else:
                state = next_state
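# Launch sketch for the decoupled A3C agent and its workers (an illustration, not part
# of the original listing): the environment name "CartPole-v1" and the hyperparameters
# are placeholders.
def run_a3c_example():
    import gym
    mp.set_start_method("spawn", force=True)  # safer default for PyTorch multiprocessing
    env = gym.make("CartPole-v1")
    agent = DecoupledA3CAgent(env, gamma=0.99, lr=1e-3, global_max_episode=3000)
    agent.train()
    agent.save_model()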
class OldSACAgent:

    def __init__(self, env, render, config_info):
        self.env = env
        self.render = render
        self._reset_env()

        # Create run folder to store parameters, figures, and tensorboard logs
        self.path_runs = create_run_folder(config_info)

        # Extract training parameters from yaml config file
        param = load_training_parameters(config_info["config_param"])
        self.train_param = param["training"]

        # Define device
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Device in use : {self.device}")

        # Define state and action dimension spaces
        state_dim = env.observation_space.shape[0]
        num_actions = env.action_space.shape[0]

        # Define models
        hidden_size = param["model"]["hidden_size"]
        self.q_net = QNetwork(state_dim, num_actions, hidden_size).to(self.device)
        self.v_net = VNetwork(state_dim, hidden_size).to(self.device)
        self.target_v_net = VNetwork(state_dim, hidden_size).to(self.device)
        self.target_v_net.load_state_dict(self.v_net.state_dict())
        self.policy_net = PolicyNetwork(state_dim, num_actions, hidden_size).to(self.device)

        # Define loss criteria
        self.q_criterion = nn.MSELoss()
        self.v_criterion = nn.MSELoss()

        # Define optimizers
        lr = float(param["optimizer"]["learning_rate"])
        self.q_opt = optim.Adam(self.q_net.parameters(), lr=lr)
        self.v_opt = optim.Adam(self.v_net.parameters(), lr=lr)
        self.policy_opt = optim.Adam(self.policy_net.parameters(), lr=lr)

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(param["training"]["replay_size"])
        self.transition = namedtuple(
            "transition",
            field_names=["state", "action", "reward", "done", "next_state"],
        )

        # Useful variables
        self.batch_size = param["training"]["batch_size"]
        self.gamma = param["training"]["gamma"]
        self.tau = param["training"]["tau"]
        self.start_step = param["training"]["start_step"]
        self.max_timesteps = param["training"]["max_timesteps"]
        self.alpha = param["training"]["alpha"]

    def _reset_env(self):
        # Reset the environment and initialize episode reward
        self.state, self.done = self.env.reset(), False
        self.episode_reward = 0.0
        self.episode_step = 0

    def train(self):
        # Main training loop
        total_timestep = 0
        all_episode_rewards = []
        all_mean_rewards = []
        update = 0

        # Create tensorboard writer
        writer = SummaryWriter(log_dir=self.path_runs, comment="-sac")

        for episode in itertools.count(1, 1):
            self._reset_env()

            while not self.done:
                # trick to improve exploration at the start of training
                if self.start_step > total_timestep:
                    action = self.env.action_space.sample()  # Sample random action
                else:
                    action = self.policy_net.get_action(self.state, self.device)  # Sample action from policy

                # Update the networks once the replay buffer holds enough transitions
                if len(self.replay_buffer) > self.batch_size:
                    batch = self.replay_buffer.sample_buffer(self.batch_size)

                    # Update parameters of all the networks
                    q_loss, v_loss, policy_loss = self.train_on_batch(batch)
                    writer.add_scalar("loss/q", q_loss, update)
                    writer.add_scalar("loss/v", v_loss, update)
                    writer.add_scalar("loss/policy", policy_loss, update)
                    update += 1

                if self.render:
                    self.env.render()

                # Perform one step in the environment
                next_state, reward, self.done, _ = self.env.step(action)
                total_timestep += 1
                self.episode_step += 1
                self.episode_reward += reward

                # Create a tuple for the new transition
                new_transition = self.transition(
                    self.state, action, reward, self.done, next_state
                )

                # Append transition to the replay buffer
                self.replay_buffer.store_transition(new_transition)

                self.state = next_state

                if total_timestep > self.max_timesteps:
                    break

            # Append the finished episode before averaging so the first episode
            # does not average an empty list
            all_episode_rewards.append(self.episode_reward)
            mean_reward = np.mean(all_episode_rewards[-100:])
            all_mean_rewards.append(mean_reward)

            print(
                "Episode n°{} ; total timestep [{}/{}] ; episode steps {} ; "
                "reward {} ; mean reward {}".format(
                    episode,
                    total_timestep,
                    self.max_timesteps,
                    self.episode_step,
                    round(self.episode_reward, 2),
                    round(mean_reward, 2),
                )
            )
            writer.add_scalar("reward", self.episode_reward, episode)
            writer.add_scalar("mean reward", mean_reward, episode)

            # Save networks' weights
            path_critic = os.path.join(self.path_runs, "critic.pth")
            path_actor = os.path.join(self.path_runs, "actor.pth")
            torch.save(self.q_net.state_dict(), path_critic)
            torch.save(self.policy_net.state_dict(), path_actor)

            # Stop the episode loop once the total step budget has been spent
            # (added so the training loop terminates)
            if total_timestep > self.max_timesteps:
                break

        # Plot reward
        self.plot_reward(all_episode_rewards, all_mean_rewards)

        # Close all
        writer.close()
        self.env.close()

    def train_on_batch(self, batch_samples):
        # Unpack batch_size transitions randomly drawn from the replay buffer
        (
            state_batch,
            action_batch,
            reward_batch,
            done_int_batch,
            next_state_batch,
        ) = batch_samples

        # Transform np arrays into tensors and send them to device
        state_batch = torch.tensor(state_batch).to(self.device)
        next_state_batch = torch.tensor(next_state_batch).to(self.device)
        action_batch = torch.tensor(action_batch).to(self.device)
        reward_batch = torch.tensor(reward_batch).unsqueeze(1).to(self.device)
        done_int_batch = torch.tensor(done_int_batch).unsqueeze(1).to(self.device)

        q_value, _ = self.q_net(state_batch, action_batch)
        value = self.v_net(state_batch)
        pi, log_pi = self.policy_net.sample(state_batch)

        ### Update Q
        target_next_value = self.target_v_net(next_state_batch)
        next_q_value = reward_batch + (1 - done_int_batch) * self.gamma * target_next_value
        q_loss = self.q_criterion(q_value, next_q_value.detach())

        ### Update V
        q_pi, _ = self.q_net(state_batch, pi)
        next_value = q_pi - log_pi
        v_loss = self.v_criterion(value, next_value.detach())

        ### Update policy
        log_pi_target = q_pi - value
        policy_loss = (log_pi * (log_pi - log_pi_target).detach()).mean()

        # Losses and optimizers
        self.q_opt.zero_grad()
        q_loss.backward()
        self.q_opt.step()

        self.v_opt.zero_grad()
        v_loss.backward()
        self.v_opt.step()

        self.policy_opt.zero_grad()
        policy_loss.backward()
        self.policy_opt.step()

        soft_update(self.target_v_net, self.v_net, self.tau)

        return q_loss.item(), v_loss.item(), policy_loss.item()

    def plot_reward(self, data, mean_data):
        plt.plot(data, label="reward")
        plt.plot(mean_data, label="mean reward")
        plt.xlabel("Episode")
        plt.ylabel("Reward")
        plt.title(f"Reward evolution for {self.env.unwrapped.spec.id} Gym environment")
        plt.tight_layout()
        plt.legend()

        path_fig = os.path.join(self.path_runs, "figure.png")
        plt.savefig(path_fig)
        print(f"Figure saved to {path_fig}")
        plt.show()
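# soft_update(...) is called in train_on_batch above but not defined in this listing.
# A minimal sketch of the usual Polyak-averaging update, matching the
# (target, source, tau) call signature used there; this helper is an assumption about
# the project's own utility module, consistent with the explicit soft updates in the
# other agents below.
def soft_update(target_net, source_net, tau):
    # target <- tau * source + (1 - tau) * target, parameter by parameter
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)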
class SACAgent:

    def __init__(self, env, gamma, tau, alpha, q_lr, policy_lr, a_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [0, 250]
        self.obs_dim = env.state_dim
        self.action_dim = env.action_dim

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to target param
        for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        # entropy temperature
        self.alpha = alpha
        self.target_entropy = -torch.prod(
            torch.Tensor([self.action_dim, 1]).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        next_q_target = torch.min(next_q1, next_q2) - self.alpha * next_log_pi
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # q loss
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update q networks
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy network and target q networks
        new_actions, log_pi = self.policy_net.sample(states)
        if self.update_step % self.delay_step == 0:
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (self.alpha * log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_q_net1.parameters(),
                                           self.q_net1.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
            for target_param, param in zip(self.target_q_net2.parameters(),
                                           self.q_net2.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        # update temperature
        alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean()

        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp()

        self.update_step += 1
class SACAgent():

    def __init__(self, env: object, gamma: float, tau: float, buffer_maxlen: int,
                 critic_lr: float, actor_lr: float, reward_scale: int):
        # Select the device to use: CUDA (GPU) if available, otherwise CPU
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # The Gym environment used for training and evaluation
        self.env = env

        # Get max and min values of the action for this environment
        self.action_range = [self.env.action_space.low, self.env.action_space.high]

        # Get dimensions of the state and the action
        self.obs_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.buffer_maxlen = buffer_maxlen
        self.reward_scale = reward_scale

        # Scaling and bias factor for the actions: scaling is needed because each
        # environment has different min and max action values
        self.scale = (self.action_range[1] - self.action_range[0]) / 2.0
        self.bias = (self.action_range[1] + self.action_range[0]) / 2.0

        # initialize networks
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.target_q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy weight parameters to the target Q networks
        for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
            target_param.data.copy_(param)
        for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=self.critic_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=self.critic_lr)
        self.policy_optimizer = optim.Adam(self.policy.parameters(), lr=self.actor_lr)

        # Create a replay buffer
        self.replay_buffer = BasicBuffer(self.buffer_maxlen)

    def update(self, batch_size: int):
        # Sample experiences from the replay buffer
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # Convert numpy arrays of experience tuples into pytorch tensors;
        # in SAC we apply reward scaling to the sampled rewards
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = self.reward_scale * torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Critic update (computing the loss)
        # Please refer to equation (6) in the paper for details

        # Sample actions for the next states (s_t+1) using the current policy
        next_actions, next_log_pi, _, _ = self.policy.sample(next_states, self.scale)
        next_actions = self.rescale_action(next_actions)

        # Compute Q(s_t+1, a_t+1) with both target Q networks and take the minimum
        next_q1 = self.target_q_net1(next_states, next_actions)
        next_q2 = self.target_q_net2(next_states, next_actions)
        min_q = torch.min(next_q1, next_q2)

        # Compute the next Q target: min Q(s_t+1, a_t+1) - alpha * next_log_pi
        next_q_target = (min_q - next_log_pi)

        # Compute Q(s_t, a_t) using s_t and a_t from the replay buffer
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)

        # Expected Q, i.e., r(t) + gamma * next_q_target
        expected_q = rewards + (1 - dones) * self.gamma * next_q_target

        # Compute the loss between each Q network and the expected Q
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # Backpropagate the losses and update Q network parameters
        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # Policy update (computing the loss)
        # Sample new actions for the current states (s_t) using the current policy
        new_actions, log_pi, _, _ = self.policy.sample(states, self.scale)
        new_actions = self.rescale_action(new_actions)

        # Compute Q(s_t, a_t) and choose the minimum from the 2 Q networks
        new_q1 = self.q_net1.forward(states, new_actions)
        new_q2 = self.q_net2.forward(states, new_actions)
        min_q = torch.min(new_q1, new_q2)

        # Policy loss, i.e., alpha * log_pi - Q(s_t, a_t), eq. (7)
        policy_loss = (log_pi - min_q).mean()

        # Backpropagate the loss and update policy network parameters
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()

        # Update target networks with a soft update using update rate tau
        for target_param, param in zip(self.target_q_net1.parameters(), self.q_net1.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)
        for target_param, param in zip(self.target_q_net2.parameters(), self.q_net2.parameters()):
            target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

    def get_action(self, state: np.ndarray,
                   stochastic: bool) -> Tuple[np.ndarray, torch.Tensor, torch.Tensor]:
        # state: the state input to the pi network
        # stochastic: True -> sample a noisy action, False -> use the deterministic (mean) action
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)

        # Get mean and sigma from the policy network
        mean, log_std = self.policy.forward(state)
        std = log_std.exp()

        # Stochastic mode is used for training, deterministic mode for evaluation
        if stochastic:
            normal = Normal(mean, std)
            z = normal.sample()
            action = torch.tanh(z)
        else:
            # deterministic action: squash the mean directly
            # (the original sampled from Normal(mean, 0), which is equivalent but
            # fails argument validation in recent PyTorch versions)
            action = torch.tanh(mean)
        action = action.cpu().detach().squeeze(0).numpy()

        # Return a rescaled action plus the mean and standard deviation of the action.
        # Rescaling is needed because the policy network outputs in [-1, 1] while the
        # MuJoCo environments can range over [-n, n] for an arbitrary real n.
        return self.rescale_action(action), mean, std

    def rescale_action(self, action: np.ndarray) -> np.ndarray:
        # scale -> scalar multiplication, bias -> scalar offset
        return action * self.scale[0] + self.bias[0]

    def Actor_save(self, WORKSPACE: str):
        # save the policy model for each node
        print("Save the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"
        torch.save(self.policy.state_dict(), savePath)

    def Actor_load(self, WORKSPACE: str):
        # load the policy model for each node
        print("load the torch model")
        savePath = WORKSPACE + "./policy_model5_Hop_.pth"  # Best
        self.policy = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy.load_state_dict(torch.load(savePath))
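# Evaluation sketch (an addition, not part of the original class): runs the trained
# policy deterministically for a few episodes and returns the average episode return.
def evaluate_policy(agent, env, num_episodes=5):
    returns = []
    for _ in range(num_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            action, _, _ = agent.get_action(state, stochastic=False)  # deterministic action
            state, reward, done, _ = env.step(action)
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)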
class DRTRPOAgent():
    """
    DR TRPO
    """

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits  # the policy network already outputs action probabilities
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_adv_mc(self, trajectory):
        """
        Compute the advantage of all (st, at) in the trajectory.
        The advantage is estimated using MC:
        i.e. discounted reward sum (from trajectory) - value (from NN).
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted Monte-Carlo return from each step
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        advantages = value_targets - values
        return advantages, value_loss

    def compute_adv_td(self, state, next_state, reward):
        """
        Compute the advantage of a single (s, a) using TD:
        i.e. r + v(s') - v(s) - depends highly on the accuracy of the NN.
        """
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        reward = torch.as_tensor(reward)

        state_value = self.value_network.forward(state)
        next_state_value = self.value_network.forward(next_state)
        value_target = reward + next_state_value

        advantage = value_target - state_value
        value_loss = F.mse_loss(state_value, value_target)
        return advantage, value_loss

    def compute_policy_loss_kl(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (KL constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits

        state_adv = torch.FloatTensor(state_adv).to(self.device)

        # closed-form update: new_pi proportional to exp(A(s, a) / beta) * pi
        denom = torch.sum(torch.exp(state_adv / beta) * pi_dist)
        new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom

        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_policy_loss_wass(self, state, state_adv, beta):
        """
        Policy loss of DR TRPO (Wasserstein constraint).
        """
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        pi_dist = logits

        state_adv = torch.FloatTensor(state_adv).to(self.device)

        # Find argmax_j {A(s, aj) - beta * d(aj, ai)} for each action ai
        best_j = []
        for i in range(self.action_dim):
            opt_j = 0
            opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i)
            for j in range(self.action_dim):
                cur_val = state_adv[j] - beta * self.compute_distance(j, i)
                if cur_val > opt_val:
                    opt_j = j
                    opt_val = cur_val
            best_j.append(opt_j)

        # move all probability mass of each action ai onto its best action best_j[i]
        new_pi_dist = torch.zeros(self.action_dim)
        for j in range(self.action_dim):
            for i in range(self.action_dim):
                if j == best_j[i]:
                    new_pi_dist[j] += pi_dist[i]

        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_distance(self, a1, a2):
        # 0-1 metric between discrete actions
        return 0 if a1 == a2 else 1

    def update(self, value_loss, policy_loss):
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
class Agent():

    def __init__(self, state_size, action_size, action_dim, config):
        self.state_size = state_size
        self.action_size = action_size
        self.action_dim = action_dim
        self.seed = 0
        self.device = 'cuda'
        self.batch_size = config["batch_size"]
        self.lr = 0.005
        self.gamma = 0.99

        self.q_shift_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.q_shift_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.Q_local = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.Q_target = QNetwork(state_size, action_size, self.seed).to(self.device)
        self.R_local = RNetwork(state_size, action_size, self.seed).to(self.device)
        self.R_target = RNetwork(state_size, action_size, self.seed).to(self.device)
        self.policy = PolicyNetwork(state_size, action_size, self.seed).to(self.device)
        self.predicter = Classifier(state_size, action_dim, self.seed).to(self.device)
        # self.criterion = nn.CrossEntropyLoss()

        # optimizers
        self.optimizer_q_shift = optim.Adam(self.q_shift_local.parameters(), lr=self.lr)
        self.optimizer_q = optim.Adam(self.Q_local.parameters(), lr=self.lr)
        self.optimizer_r = optim.Adam(self.R_local.parameters(), lr=self.lr)
        self.optimizer_p = optim.Adam(self.policy.parameters(), lr=self.lr)
        self.optimizer_pre = optim.Adam(self.predicter.parameters(), lr=self.lr)

        pathname = "lr {} batch_size {} seed {}".format(self.lr, self.batch_size, self.seed)
        tensorboard_name = str(config["locexp"]) + '/runs/' + pathname
        self.writer = SummaryWriter(tensorboard_name)
        self.steps = 0

        self.ratio = 1. / action_dim
        self.all_actions = []
        for a in range(self.action_dim):
            action = torch.zeros(1) + a  # one-element tensor holding the action index
            self.all_actions.append(action.to(self.device))

    def act(self, state):
        dis, action, log_probs, ent = self.policy.sample_action(
            torch.Tensor(state).unsqueeze(0))
        return dis, action, log_probs, ent

    def learn(self, memory):
        states, next_states, actions = memory.expert_policy(self.batch_size)
        self.state_action_frq(states, actions)
        self.get_action_prob(states, actions)
        self.compute_r_function(states, actions)
        # NOTE: the code below is unreachable in the original listing (early return);
        # it is kept for reference, with the obvious name errors fixed.
        return

        # compute difference between Q_shift and y_sh
        q_sh_value = self.q_shift_local(next_states, actions)
        y_sh = np.empty((self.batch_size, 1), dtype=np.float32)
        for idx, s in enumerate(next_states):
            q = []
            for action in self.all_actions:
                q.append(self.Q_target(s.unsqueeze(0), action.unsqueeze(0)))
            q_max = max(q)
            np.copyto(y_sh[idx], q_max.detach().numpy())

        y_sh = torch.Tensor(y_sh)
        y_sh *= self.gamma
        q_shift_loss = F.mse_loss(y_sh, q_sh_value)

        # Minimize the loss
        self.optimizer_q_shift.zero_grad()
        q_shift_loss.backward()
        self.optimizer_q_shift.step()

        # minimize MSE between predicted Q and y = r'(s, a) + gamma * max Q'(s', a)
        q_current = self.Q_local(states, actions)
        r_hat = self.R_target(states, actions)
        y_q = r_hat + y_sh  # use y_sh as target
        q_loss = F.mse_loss(q_current, y_q)

        # Minimize the loss
        self.optimizer_q.zero_grad()
        q_loss.backward()
        self.optimizer_q.step()

        # get predicted reward
        r = self.R_local(states, actions)

    def state_action_frq(self, states, action):
        """ Train the classifier to estimate the state-action frequencies """
        self.steps += 1
        output = self.predicter(states)

        # build class labels from the actions
        y = action.type(torch.long)
        y = y.squeeze(1)
        loss = nn.CrossEntropyLoss()(output, y)

        self.optimizer_pre.zero_grad()
        loss.backward()
        self.optimizer_pre.step()
        self.writer.add_scalar('Predict_loss', loss, self.steps)

    def get_action_prob(self, states, actions, dim=False):
        """ Return the log probability of the given actions under the classifier """
        if dim:
            output = self.predicter(states)
            action_prob = output.gather(1, actions.type(torch.long))
            action_prob = torch.log(action_prob)
            return action_prob

        output = self.predicter(states)
        print("Output prob ", output)
        action_prob = output.gather(1, actions.type(torch.long))
        print("action prob ", action_prob)
        action_prob = torch.log(action_prob)
        print("action prob ", action_prob)
        return action_prob

    def compute_r_function(self, states, actions):
        """ Estimate the reward for the given state-action pairs """
        actions = actions.type(torch.float)
        y = self.R_local(states, actions)
        y_shift = self.q_shift_target(states, actions)
        y_r_part1 = self.get_action_prob(states, actions) - y_shift
        print("ratio ", self.ratio)

        # sum over all other actions
        y_r_part2 = torch.empty((self.batch_size, 1), dtype=torch.float32)
        idx = 0
        for a, s in zip(actions, states):
            y_h = 0
            for b in self.all_actions:
                if torch.eq(a, b):
                    continue
                print("diff ac ", b)
                r_hat = self.R_target(s.unsqueeze(0), b.unsqueeze(0))
                n_b = self.get_action_prob(s.unsqueeze(0), b.unsqueeze(0), True) \
                    - self.q_shift_target(s.unsqueeze(0), b.unsqueeze(0))
                y_h += (r_hat - n_b)
            y_h = self.ratio * y_h
            y_r_part2[idx] = y_h
            idx += 1

        print("shape of r y ", y.shape)
        print("y r part 1 ", y_r_part1.shape)
        print("y r part 2 ", y_r_part2.shape)
class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted Monte-Carlo return from each step
        # sorry, not the most readable code.
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = rewards.view(-1, 1) + \
            torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # entropy bonus: -sum_a pi(a|s) log pi(a|s) for each state
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) \
            * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
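# Rollout sketch for the A2C agent above (an addition for illustration, not part of the
# original class): collect one episode as a list of [s, a, r, s', done] transitions,
# then perform a single update on it, returning the episode reward.
def run_a2c_episode(agent, env):
    trajectory, state, done = [], env.reset(), False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append([state, action, reward, next_state, done])
        state = next_state
    agent.update(trajectory)
    return sum(sars[2] for sars in trajectory)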
class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = logits  # the policy network already outputs action probabilities
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory, adv_method):
        """
        When gamma is large, the NN loss does not converge; we should use MC to
        estimate the advantage. When gamma is small (e.g. 0.9), the NN loss decreases
        after training, so we can use TD to estimate the advantage.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value target: discounted Monte-Carlo return from each step
        # sorry, not the most readable code.
        discounted_rewards = [
            torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                      * rewards[j:])
            for j in range(rewards.size(0))
        ]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = logits
        probs = Categorical(dists)

        # entropy bonus: -sum_a pi(a|s) log pi(a|s) for each state
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        # 0 for MC, 1 for TD
        if adv_method == 0:
            advantages = value_targets - values
        if adv_method == 1:
            # rewards reshaped to (T, 1) so the arithmetic stays elementwise
            advantages = rewards.view(-1, 1) - values + self.gamma * torch.cat(
                (values[1:], torch.FloatTensor([[0]])), dim=0)

        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) \
            * advantages.detach()
        policy_loss = policy_loss.sum() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory, adv_method):
        value_loss, policy_loss = self.compute_loss(trajectory, adv_method)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
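# Helper sketch (an addition, not part of the original classes): the discounted-return
# list comprehension used in compute_loss above is O(T^2); the same Monte-Carlo returns
# can be computed in a single backward pass over the rewards.
def discounted_returns(rewards, gamma):
    # rewards: 1-D tensor of per-step rewards; returns G_t = sum_k gamma^k * r_{t+k}
    returns = torch.zeros_like(rewards)
    running = 0.0
    for t in range(rewards.size(0) - 1, -1, -1):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns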
class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        # self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  # 1
        self.conv_channels = 4
        self.kernel_size = (3, 3)
        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        # print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2], self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net init'd successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size, self.action_dim).to(self.device)
        print("Finished initing all nets")

        # copy params to target param
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)
        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        print("Finished initing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(f"Invalid size, expected shape {self.img_size}, got {state.shape}")
            return None

        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # states and next_states are lists of ndarrays; np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1, 2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # Process images (properly shaped due to batching)
        features = self.feature_net(states)
        next_features = self.feature_net(next_states)
        # flatten per sample; the original hard-coded a batch size of 64 here
        features = features.view(features.size(0), self.input_size)
        next_features = next_features.view(next_features.size(0), self.input_size)

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks (retain_graph because the feature maps are shared)
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target value network
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # target networks
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1