# NOTE: imports required by the two trainers below. Standard libraries are
# imported directly; the project-specific classes (the UnityEnv gym-style
# wrapper, ActorCritic, Buffer, Data, Utils) are defined elsewhere in this
# project, so the commented import paths are assumptions and may need adjusting.
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# from mlagents_envs.side_channel.engine_configuration_channel import EngineConfigurationChannel
# from unity_env import UnityEnv
# from model import ActorCritic
# from buffer import Buffer
# from data import Data
# import Utils

# Device selection (assumption: this may originally live in a shared utils module)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class PPO():
    def __init__(self):
        # Hyperparameters
        self.learning_rate = 0.0003
        self.betas = (0.9, 0.999)
        self.gamma = 0.99
        self.eps_clip = 0.2
        self.buffer_size = 2048
        self.batch_size = 256
        self.K_epochs = 3
        self.max_steps = 100000
        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5
        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=True)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents
        print("Nº of Agents: ", self.n_agents)

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(),
                                    lr=self.learning_rate,
                                    betas=self.betas)
        self.MseLoss = nn.MSELoss()

        # Buffer memory (one buffer per agent)
        self.memory = []
        for _ in range(self.n_agents):
            self.memory.append(Buffer())

        # Initialize time step (for updating when buffer_size is full)
        self.t_step = 1

    def train(self):
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):
            action = []
            logprobs = []
            value = []

            # Action of each agent
            for i in range(self.n_agents):
                a, b, c = self.act(state[i])
                action.append(a)
                logprobs.append(b)
                value.append(c)

            # Send the actions to the environment
            next_state, reward, done, info = self.env.step(action)

            # Convert done flags into masks (1 = not done, 0 = done)
            done_ = []
            for i in range(self.n_agents):
                done_.append(1 - done[i])

            # Agent step
            for i in range(self.n_agents):
                self.step(state[i], action[i], reward[i], next_state[i],
                          done_[i], logprobs[i], value[i], self.memory[i])

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            self.data.update_score(reward, value, done, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get action probabilities and value from the ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)
        log_probs = F.log_softmax(action_probs, -1)

        # Sample an action and keep the log-probability of the chosen action
        action = prob.multinomial(num_samples=1)
        log_probs = log_probs.gather(1, action)

        return action, log_probs, value

    def step(self, state, action, reward, next_state, done, logprobs, value, memory):
        # Update the model when buffer_size is full
        # (assumes buffer_size is divisible by n_agents)
        if memory.len_() == (self.buffer_size / self.n_agents):
            self.learn()
            for i in range(self.n_agents):
                self.memory[i].reset()

        # Save experience in buffer memory
        memory.add(state, action, reward, next_state, done, logprobs, value)

    def evaluate(self, states, next_states, actions, rewards, masks, compute_gae):
        logits, values = self.model(states)

        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        values_ = values
        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        returns = []
        if compute_gae:
            gae = torch.zeros(1, 1)
            for i in reversed(range(len(rewards))):
                # Generalized Advantage Estimation
                delta_t = rewards[i] + self.gamma * masks[i] * values[i + 1].data - values[i].data
                gae = gae * self.gamma * self.tau * masks[i] + delta_t
                returns.insert(0, gae + values[i])

        return log_probs, values_, entropies, returns

    def compute_returns(self):
        returns_ = []
        for i in range(self.n_agents):
            # Get experiences (of each agent)
            experiences = self.memory[i].get()
            states, actions, rewards, next_states, dones, logprobs_, values_ = experiences

            # Evaluate
            _, _, _, r = self.evaluate(states, next_states, actions, rewards,
                                       dones, compute_gae=True)
            returns_.append(r)

        # Flatten the per-agent returns into a single list
        l = []
        for i in range(len(returns_)):
            for j in range(len(returns_[0])):
                l.append(returns_[i][j])
        return l

    def learn(self):
        # Get experiences
        states, actions, rewards, next_states, dones, logprobs_, values_ = self.getExp()

        returns_eval = self.compute_returns()
        returns_eval = torch.tensor(returns_eval).to(device)
        returns_eval = returns_eval.unsqueeze(1)

        # Optimize policy for K epochs
        for _ in range(self.K_epochs):
            # List with all indices
            l = np.arange(self.buffer_size)
            l = list(l)

            x = self.buffer_size // self.batch_size
            for _ in range(x):
                # Take a random batch
                indices = random.sample(l, self.batch_size)

                old_logprobs = torch.empty(self.batch_size, 1)
                old_values = torch.empty(self.batch_size, 1)
                old_actions = torch.empty(self.batch_size)
                old_states = torch.empty(self.batch_size, self.state_size)
                old_next_states = torch.empty(self.batch_size, self.state_size)
                old_rewards = np.zeros(self.batch_size)
                returns = torch.empty(self.batch_size, 1)

                for i in range(len(indices)):
                    old_logprobs[i] = logprobs_[indices[i]]
                    old_values[i] = values_[indices[i]]
                    old_actions[i] = actions[indices[i]]
                    old_states[i] = states[indices[i]]
                    old_next_states[i] = next_states[indices[i]]
                    old_rewards[i] = rewards[indices[i]]
                    returns[i] = returns_eval[indices[i]]

                old_actions = old_actions.long()

                # Remove used indices so batches do not repeat samples
                for i in indices:
                    l.remove(i)

                # Evaluate
                logprobs, state_values, dist_entropy, _ = self.evaluate(
                    old_states, old_next_states, old_actions, rewards, dones,
                    compute_gae=False)

                # Finding the ratio (pi_theta / pi_theta_old)
                ratios = torch.exp(logprobs - old_logprobs)

                # Finding the surrogate loss
                advantages = returns - old_values
                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip) * advantages

                # LOSS = ACTOR LOSS + CRITIC_DISCOUNT * CRITIC_LOSS - ENTROPY_BETA * ENTROPY
                loss = -torch.min(surr1, surr2) \
                    + self.value_loss_coef * self.MseLoss(state_values, returns) \
                    - self.entropy_coef * dist_entropy

                # Optimizer step
                self.optimizerStep(self.optimizer, loss.mean())

    def optimizerStep(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    def getExp(self):
        # Collect the experiences stored by every agent
        states, actions, rewards, next_states, dones, logprobs, values = [], [], [], [], [], [], []
        for i in range(self.n_agents):
            experiences = self.memory[i].get()
            states.append(experiences[0])
            actions.append(experiences[1])
            rewards.append(experiences[2])
            next_states.append(experiences[3])
            dones.append(experiences[4])
            logprobs.append(experiences[5])
            values.append(experiences[6])

        # Flatten the per-agent lists into single lists of length buffer_size
        states_, actions_, rewards_, next_states_, dones_, logprobs_, values_ = [], [], [], [], [], [], []
        for i in range(len(states)):
            for j in range(len(states[0])):
                states_.append(states[i][j])
                actions_.append(actions[i][j])
                rewards_.append(rewards[i][j])
                next_states_.append(next_states[i][j])
                dones_.append(dones[i][j])
                logprobs_.append(logprobs[i][j])
                values_.append(values[i][j])

        # Stack the flattened lists into tensors
        states__ = torch.empty(self.buffer_size, self.state_size)
        actions__ = torch.empty(self.buffer_size)
        next_states__ = torch.empty(self.buffer_size, self.state_size)
        dones__ = torch.empty(self.buffer_size)
        logprobs__ = torch.empty(self.buffer_size, 1, 1)
        values__ = torch.empty(self.buffer_size)
        for i in range(self.buffer_size):
            states__[i] = states_[i]
            actions__[i] = actions_[i]
            next_states__[i] = next_states_[i]
            dones__[i] = dones_[i]
            logprobs__[i] = logprobs_[i]
            values__[i] = values_[i]

        return states__, actions__, rewards_, next_states__, dones__, logprobs__, values__
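
# A minimal usage sketch for the PPO trainer above (an assumption: it presumes
# the Unity build at "Environments/env1/Unity Environment" and the
# 'Saved Models/' directory exist on disk):
#
#     ppo = PPO()
#     ppo.train()                               # collect rollouts and run PPO updates
#     ppo.load_model('Saved Models/model.pth')  # restore a saved checkpoint
#
# Per mini-batch, learn() minimizes
#     -min(r_t * A_t, clip(r_t, 1 - eps_clip, 1 + eps_clip) * A_t)
#         + value_loss_coef * MSE(V(s_t), R_t) - entropy_coef * H[pi(.|s_t)]
# where r_t = pi(a_t|s_t) / pi_old(a_t|s_t), A_t = R_t - V_old(s_t), and R_t
# are the GAE returns computed in evaluate().
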
class AC():
    def __init__(self):
        # Hyperparameters
        self.learning_rate = 0.0003
        self.gamma = 0.99
        self.batch_size = 256
        self.max_steps = 100000
        self.tau = 0.95
        self.entropy_coef = 0.001
        self.value_loss_coef = 0.5
        self.summary_freq = 1000

        # Environment
        self.env_name = "Environments/env1/Unity Environment"
        channel = EngineConfigurationChannel()
        self.env = UnityEnv(self.env_name,
                            worker_id=0,
                            use_visual=False,
                            side_channels=[channel],
                            no_graphics=False,
                            multiagent=False)
        channel.set_configuration_parameters(time_scale=100)
        self.action_size, self.state_size = Utils.getActionStateSize(self.env)
        self.n_agents = self.env.number_agents

        # Model
        self.model = ActorCritic(self.state_size, self.action_size, seed=0).to(device)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Buffer memory
        self.memory = Buffer()

        # Initialize time step (for updating every "batch_size" time steps)
        self.t_step = 1

    def train(self):
        # Initial observation
        env_info = self.env.reset()
        state = env_info

        # Data
        self.data = Data(self.n_agents, self.summary_freq)

        # Training loop
        for _ in range(self.max_steps):
            # Action of agent
            action, value = self.act(state)

            # Send the action to the environment
            next_state, reward, done, info = self.env.step(action)

            # Agent step
            self.step(state, action, reward, next_state, done)

            # Update t_step
            self.t_step += 1

            # Next state
            state = next_state

            # Update the score
            reward_ = np.expand_dims(reward, axis=0)
            value_ = value.unsqueeze(0)
            done_ = np.expand_dims(done, axis=0)
            self.data.update_score(reward_, value_, done_, self.t_step)

            # Summary
            if self.t_step % self.summary_freq == 0:
                self.data.summary(self.t_step)

        # Save
        self.save()

    def save(self):
        torch.save(self.model.state_dict(), 'Saved Models/model.pth')
        self.data.results()

    def load_model(self, model):
        self.model.load_state_dict(torch.load(model))

    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)

        # Get action probabilities and value from the ActorCritic model
        self.model.eval()
        with torch.no_grad():
            action_probs, value = self.model(state)
        self.model.train()

        prob = F.softmax(action_probs, -1)

        # Sample an action from the probabilities
        action = prob.multinomial(num_samples=1)

        return action, value

    def step(self, state, action, reward, next_state, done):
        # Save experience in buffer memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every "batch_size" time steps
        if self.t_step % self.batch_size == 0:
            experiences = self.memory.get()
            self.learn(experiences)
            self.memory.reset()

    def learn(self, experiences):
        # Get experiences
        states, actions, rewards, next_states = experiences

        logits, values = self.model(states)

        probs = F.softmax(logits, -1)
        log_probs = F.log_softmax(logits, -1)
        entropies = -(log_probs * probs).sum(1, keepdim=True)
        log_probs = log_probs.gather(1, actions.unsqueeze(1))

        _, value = self.model(next_states)
        values = torch.cat((values, value.data))

        policy_loss = 0
        value_loss = 0
        R = values[-1]
        gae = torch.zeros(1, 1)
        for i in reversed(range(len(rewards))):
            R = self.gamma * R + rewards[i]
            advantage = R - values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)

            # Generalized Advantage Estimation
            delta_t = rewards[i] + self.gamma * values[i + 1].data - values[i].data
            gae = gae * self.gamma * self.tau + delta_t

            policy_loss = policy_loss - (log_probs[i] * gae) - (self.entropy_coef * entropies[i])

        # Loss
        loss = policy_loss + self.value_loss_coef * value_loss

        # Optimizer step
        self.optimizerStep(self.optimizer, loss)

    def optimizerStep(self, optimizer, loss):
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
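

# A minimal sketch of a training entry point (an assumption; the original
# listing does not define one). It only uses the public interface of the two
# classes above: their no-argument constructors and train().
if __name__ == "__main__":
    trainer = PPO()   # or AC() for the single-agent actor-critic trainer
    trainer.train()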