import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

# ValueNetwork, SoftQNetwork, PolicyNetwork, and BasicBuffer are assumed to be
# defined elsewhere in this repo.


class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.obs_dim, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.obs_dim, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.obs_dim, self.action_dim).to(self.device)

        # copy params to the target network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)

        self.replay_buffer = BasicBuffer(buffer_maxlen)

    def get_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        mean, log_std = self.policy_net.forward(state)
        std = log_std.exp()

        # sample from a tanh-squashed Gaussian policy
        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        # map from [-1, 1] to the environment's action range
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)
        states = torch.FloatTensor(states).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        next_actions, next_log_pi = self.policy_net.sample(next_states)
        next_q1 = self.q_net1(next_states, next_actions)
        next_q2 = self.q_net2(next_states, next_actions)
        next_v = self.target_value_net(next_states)

        # value loss: V(s) regresses onto min(Q1, Q2) - log pi
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(states)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss: Q(s, a) regresses onto r + gamma * V_target(s')
        curr_q1 = self.q_net1.forward(states, actions)
        curr_q2 = self.q_net2.forward(states, actions)
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value network and q networks
        self.value_optimizer.zero_grad()
        v_loss.backward()
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward()
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward()
        self.q2_optimizer.step()

        # delayed update for policy net and target value net
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(states)
            min_q = torch.min(self.q_net1.forward(states, new_actions),
                              self.q_net2.forward(states, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward()
            self.policy_optimizer.step()

            # soft update of the target value network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
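The class above only defines the agent; a driver still has to collect transitions and call update(). The sketch below is one minimal way that might look, assuming a continuous-control Gym environment ("Pendulum-v0" is only an example) and that BasicBuffer exposes push(state, action, reward, next_state, done) and __len__(), neither of which is shown in this file; all hyperparameters are illustrative.

# minimal training-loop sketch (assumptions noted above)
import gym

env = gym.make("Pendulum-v0")
agent = SACAgent(env, gamma=0.99, tau=0.01, v_lr=3e-4, q_lr=3e-4,
                 policy_lr=3e-4, buffer_maxlen=100000)

max_episodes, max_steps, batch_size = 100, 500, 64
for episode in range(max_episodes):
    state = env.reset()
    episode_reward = 0
    for step in range(max_steps):
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.replay_buffer.push(state, action, reward, next_state, done)  # assumed API
        episode_reward += reward

        # start updating once the buffer holds at least one batch
        if len(agent.replay_buffer) > batch_size:
            agent.update(batch_size)

        if done:
            break
        state = next_state
    print(f"episode {episode}: {episode_reward}")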
import torch
import torch.nn.functional as F
import torch.multiprocessing as mp
from torch.distributions import Categorical

# ValueNetwork and PolicyNetwork are assumed to be defined elsewhere in this repo.


class DecoupledWorker(mp.Process):

    def __init__(self, id, env, gamma, global_value_network, global_policy_network,
                 global_value_optimizer, global_policy_optimizer, global_episode,
                 GLOBAL_MAX_EPISODE):
        super(DecoupledWorker, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.name = "w%i" % id

        self.env = env
        self.env.seed(id)
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.local_value_network = ValueNetwork(self.obs_dim, 1)
        self.local_policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.global_value_network = global_value_network
        self.global_policy_network = global_policy_network
        self.global_episode = global_episode
        self.global_value_optimizer = global_value_optimizer
        self.global_policy_optimizer = global_policy_optimizer
        self.GLOBAL_MAX_EPISODE = GLOBAL_MAX_EPISODE

        # sync local networks with global networks
        self.sync_with_global()

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.local_policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # Monte Carlo value targets: discounted return from each step onwards
        # (the immediate reward is already the first term of each sum)
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.local_value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.local_policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # entropy bonus: H(p) = -sum_a p(a) * log p(a)
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update_global(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.global_value_optimizer.zero_grad()
        value_loss.backward()
        # propagate local gradients to the global parameters
        for local_params, global_params in zip(self.local_value_network.parameters(),
                                               self.global_value_network.parameters()):
            global_params._grad = local_params._grad
        self.global_value_optimizer.step()

        self.global_policy_optimizer.zero_grad()
        policy_loss.backward()
        # propagate local gradients to the global parameters
        for local_params, global_params in zip(self.local_policy_network.parameters(),
                                               self.global_policy_network.parameters()):
            global_params._grad = local_params._grad
        self.global_policy_optimizer.step()

    def sync_with_global(self):
        self.local_value_network.load_state_dict(self.global_value_network.state_dict())
        self.local_policy_network.load_state_dict(self.global_policy_network.state_dict())

    def run(self):
        state = self.env.reset()
        trajectory = []  # [[s, a, r, s', done], ...]
        episode_reward = 0

        while self.global_episode.value < self.GLOBAL_MAX_EPISODE:
            action = self.get_action(state)
            next_state, reward, done, _ = self.env.step(action)
            trajectory.append([state, action, reward, next_state, done])
            episode_reward += reward

            if done:
                with self.global_episode.get_lock():
                    self.global_episode.value += 1
                print(self.name + " | episode: " + str(self.global_episode.value)
                      + " " + str(episode_reward))

                self.update_global(trajectory)
                self.sync_with_global()

                trajectory = []
                episode_reward = 0
                state = self.env.reset()
            else:
                state = next_state
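The worker expects shared global networks, shared optimizers, and a shared episode counter to be created by a parent process. A sketch of how that wiring might look is below; the environment name, learning rates, worker count, and episode limit are all illustrative assumptions, and the same ValueNetwork / PolicyNetwork classes are assumed available.

# sketch of a parent process that launches DecoupledWorker processes
import gym
import torch.multiprocessing as mp
import torch.optim as optim

if __name__ == "__main__":
    env_name = "CartPole-v0"
    probe_env = gym.make(env_name)
    obs_dim = probe_env.observation_space.shape[0]
    action_dim = probe_env.action_space.n

    # global networks live in shared memory so every worker sees the same parameters
    global_value_network = ValueNetwork(obs_dim, 1)
    global_policy_network = PolicyNetwork(obs_dim, action_dim)
    global_value_network.share_memory()
    global_policy_network.share_memory()

    global_episode = mp.Value("i", 0)  # shared episode counter
    global_value_optimizer = optim.Adam(global_value_network.parameters(), lr=1e-3)
    global_policy_optimizer = optim.Adam(global_policy_network.parameters(), lr=1e-4)

    workers = [DecoupledWorker(i, gym.make(env_name), 0.99,
                               global_value_network, global_policy_network,
                               global_value_optimizer, global_policy_optimizer,
                               global_episode, GLOBAL_MAX_EPISODE=1000)
               for i in range(mp.cpu_count())]
    [w.start() for w in workers]
    [w.join() for w in workers]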
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# ValueNetwork and PolicyNetwork are assumed to be defined elsewhere in this repo;
# here PolicyNetwork is expected to output action probabilities directly.


class DRTRPOAgent():
    """DR TRPO."""

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        dist = self.policy_network.forward(state)  # action probabilities
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_adv_mc(self, trajectory):
        """
        Compute the advantage of every (s_t, a_t) in the trajectory.
        The advantage is estimated with Monte Carlo: the discounted reward sum
        (from the trajectory) minus the value predicted by the network.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # compute value targets: discounted return from each step onwards
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        advantages = value_targets - values
        return advantages, value_loss

    def compute_adv_td(self, state, next_state, reward):
        """
        Compute the advantage of a single (s, a) using TD: r + V(s') - V(s).
        This depends heavily on the accuracy of the value network.
        """
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        reward = torch.as_tensor(reward)

        state_value = self.value_network.forward(state)
        next_state_value = self.value_network.forward(next_state)
        value_target = reward + next_state_value
        advantage = value_target - state_value

        value_loss = F.mse_loss(state_value, value_target)
        return advantage, value_loss

    def compute_policy_loss_kl(self, state, state_adv, beta):
        """Policy loss of DR TRPO (KL constraint)."""
        state = torch.FloatTensor(state).to(self.device)
        pi_dist = self.policy_network.forward(state)
        state_adv = torch.FloatTensor(state_adv).to(self.device)

        # closed-form target: pi_new(a) is proportional to pi(a) * exp(A(s, a) / beta)
        denom = torch.sum(torch.exp(state_adv / beta) * pi_dist)
        new_pi_dist = torch.exp(state_adv / beta) * pi_dist / denom

        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_policy_loss_wass(self, state, state_adv, beta):
        """Policy loss of DR TRPO (Wasserstein constraint)."""
        state = torch.FloatTensor(state).to(self.device)
        pi_dist = self.policy_network.forward(state)
        state_adv = torch.FloatTensor(state_adv).to(self.device)

        # find argmax_j {A(s, a_j) - beta * d(a_j, a_i)} for every action a_i
        best_j = []
        for i in range(self.action_dim):
            opt_j = 0
            opt_val = state_adv[opt_j] - beta * self.compute_distance(opt_j, i)
            for j in range(self.action_dim):
                cur_val = state_adv[j] - beta * self.compute_distance(j, i)
                if cur_val > opt_val:
                    opt_j = j
                    opt_val = cur_val
            best_j.append(opt_j)

        # move the probability mass of each action a_i onto its best a_j
        new_pi_dist = torch.zeros(self.action_dim)
        for j in range(self.action_dim):
            for i in range(self.action_dim):
                if j == best_j[i]:
                    new_pi_dist[j] += pi_dist[i]

        return F.mse_loss(pi_dist, new_pi_dist)

    def compute_distance(self, a1, a2):
        # 0-1 distance between discrete actions
        if a1 == a2:
            return 0
        else:
            return 1

    def update(self, value_loss, policy_loss):
        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# ValueNetwork and PolicyNetwork are assumed to be defined elsewhere in this repo.


class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        logits = self.policy_network.forward(state)
        dist = F.softmax(logits, dim=0)
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory):
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # Monte Carlo value targets: discounted return from each step onwards
        # (the immediate reward is already the first term of each sum)
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1).to(self.device)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        logits = self.policy_network.forward(states)
        dists = F.softmax(logits, dim=1)
        probs = Categorical(dists)

        # entropy bonus: H(p) = -sum_a p(a) * log p(a)
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        advantage = value_targets - values
        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantage.detach()
        policy_loss = policy_loss.mean() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory):
        value_loss, policy_loss = self.compute_loss(trajectory)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# A variant of A2CAgent that can estimate the advantage with either Monte Carlo or TD.
# ValueNetwork and PolicyNetwork are assumed to be defined elsewhere in this repo;
# here PolicyNetwork is expected to output action probabilities directly.


class A2CAgent():

    def __init__(self, env, gamma, lr):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.env = env
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n

        self.gamma = gamma
        self.lr = lr

        self.value_network = ValueNetwork(self.obs_dim, 1)
        self.policy_network = PolicyNetwork(self.obs_dim, self.action_dim)

        self.value_optimizer = optim.Adam(self.value_network.parameters(), lr=self.lr)
        self.policy_optimizer = optim.Adam(self.policy_network.parameters(), lr=self.lr)

    def get_action(self, state):
        state = torch.FloatTensor(state).to(self.device)
        dist = self.policy_network.forward(state)  # action probabilities
        probs = Categorical(dist)

        return probs.sample().cpu().detach().item()

    def compute_loss(self, trajectory, adv_method):
        """
        When gamma is large, the value-network loss does not converge, so MC should be
        used to estimate the advantage. When gamma is small (e.g. 0.9), the loss decreases
        during training and TD can be used instead.
        """
        states = torch.FloatTensor([sars[0] for sars in trajectory]).to(self.device)
        actions = torch.LongTensor([sars[1] for sars in trajectory]).view(-1, 1).to(self.device)
        rewards = torch.FloatTensor([sars[2] for sars in trajectory]).to(self.device)
        next_states = torch.FloatTensor([sars[3] for sars in trajectory]).to(self.device)
        dones = torch.FloatTensor([sars[4] for sars in trajectory]).view(-1, 1).to(self.device)

        # Monte Carlo value targets: discounted return from each step onwards
        discounted_rewards = [torch.sum(torch.FloatTensor([self.gamma ** i for i in range(rewards[j:].size(0))])
                                        * rewards[j:]) for j in range(rewards.size(0))]
        value_targets = torch.FloatTensor(discounted_rewards).view(-1, 1)

        # compute value loss
        values = self.value_network.forward(states)
        value_loss = F.mse_loss(values, value_targets.detach())

        # compute policy loss with entropy bonus
        dists = self.policy_network.forward(states)  # action probabilities
        probs = Categorical(dists)

        # entropy bonus: H(p) = -sum_a p(a) * log p(a)
        entropy = []
        for dist in dists:
            entropy.append(-torch.sum(dist * torch.log(dist)))
        entropy = torch.stack(entropy).sum()

        # advantage estimate: 0 for MC, 1 for TD
        if adv_method == 0:
            advantages = value_targets - values
        if adv_method == 1:
            advantages = rewards.view(-1, 1) - values + self.gamma * torch.cat(
                (values[1:], torch.FloatTensor([[0]])), dim=0)

        policy_loss = -probs.log_prob(actions.view(actions.size(0))).view(-1, 1) * advantages.detach()
        policy_loss = policy_loss.sum() - 0.001 * entropy

        return value_loss, policy_loss

    def update(self, trajectory, adv_method):
        value_loss, policy_loss = self.compute_loss(trajectory, adv_method)

        self.value_optimizer.zero_grad()
        value_loss.backward()
        self.value_optimizer.step()

        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        self.policy_optimizer.step()
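Both A2C variants are updated once per episode from a full trajectory. A minimal episode-based driver might look like the sketch below, assuming a discrete-action Gym environment; adv_method=0 selects the Monte Carlo advantage and adv_method=1 the TD advantage. The environment name and hyperparameters are illustrative.

# minimal episode-based driver sketch (assumptions noted above)
import gym

env = gym.make("CartPole-v0")
agent = A2CAgent(env, gamma=0.9, lr=1e-3)

for episode in range(300):
    state = env.reset()
    trajectory = []          # [[s, a, r, s', done], ...]
    episode_reward, done = 0, False
    while not done:
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        trajectory.append([state, action, reward, next_state, done])
        episode_reward += reward
        state = next_state

    # one update per completed episode; 0 = MC advantage, 1 = TD advantage
    agent.update(trajectory, adv_method=0)
    print(f"episode {episode}: {episode_reward}")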
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal

# A variant of SACAgent that works from raw image observations. FeatureExtractor,
# ValueNetwork, SoftQNetwork, PolicyNetwork, and BasicBuffer are assumed to be
# defined elsewhere in this repo.


class SACAgent:

    def __init__(self, env, gamma, tau, v_lr, q_lr, policy_lr, buffer_maxlen):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.firsttime = 0

        self.env = env
        self.action_range = [env.action_space.low, env.action_space.high]
        # self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]  # 1
        self.conv_channels = 4
        self.kernel_size = (3, 3)

        self.img_size = (500, 500, 3)

        print("Diagnostics:")
        print(f"action_range: {self.action_range}")
        # print(f"obs_dim: {self.obs_dim}")
        print(f"action_dim: {self.action_dim}")

        # hyperparameters
        self.gamma = gamma
        self.tau = tau
        self.update_step = 0
        self.delay_step = 2

        # initialize networks
        self.feature_net = FeatureExtractor(self.img_size[2], self.conv_channels,
                                            self.kernel_size).to(self.device)
        print("Feature net initialized successfully")

        input_dim = self.feature_net.get_output_size(self.img_size)
        self.input_size = input_dim[0] * input_dim[1] * input_dim[2]
        print(f"input_size: {self.input_size}")

        self.value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.target_value_net = ValueNetwork(self.input_size, 1).to(self.device)
        self.q_net1 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.q_net2 = SoftQNetwork(self.input_size, self.action_dim).to(self.device)
        self.policy_net = PolicyNetwork(self.input_size, self.action_dim).to(self.device)
        print("Finished initializing all nets")

        # copy params to the target network
        for target_param, param in zip(self.target_value_net.parameters(),
                                       self.value_net.parameters()):
            target_param.data.copy_(param)
        print("Finished copying targets")

        # initialize optimizers
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=v_lr)
        self.q1_optimizer = optim.Adam(self.q_net1.parameters(), lr=q_lr)
        self.q2_optimizer = optim.Adam(self.q_net2.parameters(), lr=q_lr)
        self.policy_optimizer = optim.Adam(self.policy_net.parameters(), lr=policy_lr)
        print("Finished initializing optimizers")

        self.replay_buffer = BasicBuffer(buffer_maxlen)
        print("End of init")

    def get_action(self, state):
        if state.shape != self.img_size:
            print(f"Invalid size, expected shape {self.img_size}, got {state.shape}")
            return None

        # HWC image -> NCHW float tensor, then extract and flatten conv features
        inp = torch.from_numpy(state).float().permute(2, 0, 1).unsqueeze(0).to(self.device)
        features = self.feature_net(inp)
        features = features.view(-1, self.input_size)

        mean, log_std = self.policy_net.forward(features)
        std = log_std.exp()

        normal = Normal(mean, std)
        z = normal.sample()
        action = torch.tanh(z)
        action = action.cpu().detach().squeeze(0).numpy()

        return self.rescale_action(action)

    def rescale_action(self, action):
        # map from [-1, 1] to the environment's action range
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def update(self, batch_size):
        states, actions, rewards, next_states, dones = self.replay_buffer.sample(batch_size)

        # states and next_states are lists of ndarrays; np.stack converts them to
        # ndarrays of shape (batch_size, height, width, num_channels)
        states = np.stack(states)
        next_states = np.stack(next_states)

        states = torch.FloatTensor(states).permute(0, 3, 1, 2).to(self.device)
        actions = torch.FloatTensor(actions).to(self.device)
        rewards = torch.FloatTensor(rewards).to(self.device)
        next_states = torch.FloatTensor(next_states).permute(0, 3, 1, 2).to(self.device)
        dones = torch.FloatTensor(dones).to(self.device)
        dones = dones.view(dones.size(0), -1)

        # extract conv features and flatten them to (batch_size, input_size)
        features = self.feature_net(states)
        next_features = self.feature_net(next_states)
        features = features.reshape(-1, self.input_size)
        next_features = next_features.reshape(-1, self.input_size)

        next_actions, next_log_pi = self.policy_net.sample(next_features)
        next_q1 = self.q_net1(next_features, next_actions)
        next_q2 = self.q_net2(next_features, next_actions)
        next_v = self.target_value_net(next_features)

        # value loss
        next_v_target = torch.min(next_q1, next_q2) - next_log_pi
        curr_v = self.value_net.forward(features)
        v_loss = F.mse_loss(curr_v, next_v_target.detach())

        # q loss
        expected_q = rewards + (1 - dones) * self.gamma * next_v
        curr_q1 = self.q_net1.forward(features, actions)
        curr_q2 = self.q_net2.forward(features, actions)
        q1_loss = F.mse_loss(curr_q1, expected_q.detach())
        q2_loss = F.mse_loss(curr_q2, expected_q.detach())

        # update value and q networks
        # (retain_graph=True because the losses share the feature extractor's graph)
        self.value_optimizer.zero_grad()
        v_loss.backward(retain_graph=True)
        self.value_optimizer.step()

        self.q1_optimizer.zero_grad()
        q1_loss.backward(retain_graph=True)
        self.q1_optimizer.step()

        self.q2_optimizer.zero_grad()
        q2_loss.backward(retain_graph=True)
        self.q2_optimizer.step()

        # delayed update for policy network and target value network
        if self.update_step % self.delay_step == 0:
            new_actions, log_pi = self.policy_net.sample(features)
            min_q = torch.min(self.q_net1.forward(features, new_actions),
                              self.q_net2.forward(features, new_actions))
            policy_loss = (log_pi - min_q).mean()

            self.policy_optimizer.zero_grad()
            policy_loss.backward(retain_graph=True)
            self.policy_optimizer.step()

            # soft update of the target value network
            for target_param, param in zip(self.target_value_net.parameters(),
                                           self.value_net.parameters()):
                target_param.data.copy_(self.tau * param + (1 - self.tau) * target_param)

        self.update_step += 1
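The part of this variant that differs from the vector-observation agent is the image preprocessing in update(): HWC frames are stacked, permuted to NCHW for the conv layers, and the conv output is flattened to (batch_size, input_size). The short walkthrough below is illustrative only; the frame size matches img_size above, but the conv-output shape is a made-up stand-in rather than the real FeatureExtractor output.

# illustrative shape walkthrough of the image path in update()
import numpy as np
import torch

batch = [np.zeros((500, 500, 3), dtype=np.float32) for _ in range(4)]  # 4 fake frames
states = torch.FloatTensor(np.stack(batch))       # (4, 500, 500, 3), HWC per frame
states = states.permute(0, 3, 1, 2)               # (4, 3, 500, 500), NCHW for conv layers
print(states.shape)

fake_conv_out = torch.randn(4, 16, 124, 124)      # stand-in for self.feature_net(states)
flat = fake_conv_out.reshape(-1, 16 * 124 * 124)  # (4, input_size), fed to the MLP heads
print(flat.shape)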