def play(self, env, agent_a, agent_b):
    agent_a = th.cuda(agent_a)
    agent_b = th.cuda(agent_b)
    optimizer = optim.Adam(
        list(agent_a.parameters()) + list(agent_b.parameters()))

    for i in range(self.max_epochs):
        optimizer.zero_grad()

        color_codes, colors = env.mini_batch(batch_size=self.batch_size)
        color_codes = th.long_var(color_codes)
        colors = th.float_var(colors)

        loss = self.communication_channel(env, agent_a, agent_b,
                                          color_codes, colors)
        loss.backward()
        optimizer.step()

        # printing status
        if self.print_interval != 0 and ((i + 1) % self.print_interval == 0):
            self.print_status(loss)

        if self.evaluate_interval != 0 and ((i + 1) % self.evaluate_interval == 0):
            self.evaluate(env, agent_a)

    return agent_a.cpu()
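# The th.* helpers used above (th.cuda, th.long_var, th.float_var) are not
# shown in this section. Below is a minimal sketch of the behaviour they are
# assumed to have: wrap data as tensors and move modules/tensors to the GPU
# when one is available. This is an assumption for illustration, not the
# project's actual helper module.
import torch


def cuda(x):
    # Move a module or tensor to the GPU if CUDA is available; otherwise return it unchanged.
    return x.cuda() if torch.cuda.is_available() else x


def long_var(x):
    # Wrap index data (e.g. color codes) as a LongTensor on the active device.
    return cuda(torch.as_tensor(x, dtype=torch.long))


def float_var(x):
    # Wrap continuous data (e.g. color coordinates) as a FloatTensor on the active device.
    return cuda(torch.as_tensor(x, dtype=torch.float32))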
def play(self, env, agent_a, agent_b):
    agent_a = th.cuda(agent_a)
    agent_b = th.cuda(agent_b)
    receiver_opt = optim.Adam(list(agent_b.parameters()))
    optimizer = optim.Adam(
        list(agent_a.parameters()) + list(agent_b.parameters()))

    for i in range(self.max_epochs):
        # Inner loop: train the receiver alone on its listener loss
        for j in range(50):
            color_codes, colors = env.mini_batch(
                batch_size=self.batch_size)
            color_codes = th.long_var(color_codes)
            colors = th.float_var(colors)
            receiver_loss, _, _ = self.communication_channel(
                env, agent_a, agent_b, color_codes, colors)
            receiver_loss.backward()
            receiver_opt.step()
            receiver_opt.zero_grad()

        self.board_reward = 0

        # Joint update of sender and receiver on the combined loss
        optimizer.zero_grad()
        color_codes, colors = env.mini_batch(batch_size=self.batch_size)
        color_codes = th.long_var(color_codes)
        colors = th.float_var(colors)
        receiver_loss, sender_loss, entropy_loss = self.communication_channel(
            env, agent_a, agent_b, color_codes, colors)
        loss = receiver_loss + sender_loss + entropy_loss
        loss.backward()
        optimizer.step()

        # Update tensorboard
        if (i + 1) % self.print_interval == 0:
            self.tensorboard_update(i, env, agent_a, agent_b)

        # printing status
        if self.print_interval != 0 and ((i + 1) % self.print_interval == 0):
            if self.loss_type == 'REINFORCE':
                self.print_status(loss)

        if self.evaluate_interval != 0 and ((i + 1) % self.evaluate_interval == 0):
            self.evaluate(env, agent_a)
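# The three-part loss (receiver_loss, sender_loss, entropy_loss) returned by
# communication_channel above is not defined in this section. The sketch below
# shows one common REINFORCE-style decomposition for a referential game:
# cross-entropy for the receiver, a score-function (REINFORCE) term for the
# sender's discrete message sampling, and an entropy bonus. The function name
# and the receiver(msg=...) signature are illustrative assumptions, not the
# project's actual communication_channel.
import torch
import torch.nn.functional as F


def reinforce_losses(sender, receiver, colors, color_codes):
    # Sender maps the target color to a distribution over discrete messages.
    msg_logits = sender(perception=colors)
    msg_dist = torch.distributions.Categorical(logits=msg_logits)
    message = msg_dist.sample()

    # Receiver maps the sampled message back to a guess about the color code.
    guess_logits = receiver(msg=message)

    # Receiver is trained with ordinary cross-entropy against the true code.
    receiver_loss = F.cross_entropy(guess_logits, color_codes)

    # Sender is trained with REINFORCE on the non-differentiable sampling step,
    # using a mean-reward baseline to reduce variance.
    reward = (guess_logits.argmax(dim=1) == color_codes).float().detach()
    sender_loss = -(msg_dist.log_prob(message) * (reward - reward.mean())).mean()

    # Entropy bonus keeps the message distribution from collapsing too early.
    entropy_loss = -msg_dist.entropy().mean()

    return receiver_loss, sender_loss, entropy_loss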
def agent_language_map(env, a):
    # Map every perception index in the environment to the term the agent
    # assigns the highest probability.
    V = {}
    a = th.cuda(a)
    perception_indices, perceptions = env.full_batch()

    probs = a(perception=perceptions)
    _, terms = probs.max(1)

    for perception_index in perception_indices:
        V[perception_index] = terms[perception_index].item()

    return list(V.values())
def play(self, env, agent_a, agent_b):
    agent_a = th.cuda(agent_a)
    agent_b = th.cuda(agent_b)
    optimizer = optim.Adam(list(agent_a.parameters()) +
                           list(agent_b.parameters()),
                           lr=0.0001)

    for i in range(self.max_epochs):
        optimizer.zero_grad()

        color_codes, colors = env.mini_batch(batch_size=self.batch_size)
        color_codes = th.long_var(color_codes)
        colors = th.float_var(colors)

        loss = self.communication_channel(env, agent_a, agent_b,
                                          color_codes, colors)
        loss.backward()
        optimizer.step()

        # printing status
        if self.print_interval != 0 and ((i + 1) % self.print_interval == 0):
            self.print_status(loss)

        if self.evaluate_interval != 0 and ((i + 1) % self.evaluate_interval == 0):
            self.evaluate(env, agent_a)

    return agent_a.cpu()
def play(self, env, agent_a, agent_b):
    agent_a = th.cuda(agent_a)
    agent_b = th.cuda(agent_b)
    optimizer = optim.Adam(list(agent_a.parameters()) +
                           list(agent_b.parameters()),
                           lr=0.0001)

    for i in range(self.max_epochs):
        optimizer.zero_grad()

        # Agent a sends a message
        color_codes, colors = env.mini_batch(batch_size=self.batch_size)
        color_codes = th.long_var(color_codes)
        colors = th.float_var(colors)
        loss1 = self.communication_channel(env, agent_a, agent_b,
                                           color_codes, colors)
        loss1.backward()

        # Agent b sends a message
        color_codes, colors = env.mini_batch(batch_size=self.batch_size)
        color_codes = th.long_var(color_codes)
        colors = th.float_var(colors)
        loss2 = self.communication_channel(env, agent_b, agent_a,
                                           color_codes, colors)
        loss2.backward()

        # Gradients were backpropagated per direction above; apply the joint update
        optimizer.step()
        loss = loss1 + loss2

        # printing status
        if self.print_interval != 0 and ((i + 1) % self.print_interval == 0):
            self.print_status(loss)

        if self.evaluate_interval != 0 and ((i + 1) % self.evaluate_interval == 0):
            self.evaluate(env, agent_a)

    return agent_a.cpu()
def agent_language_map(env, a):
    # Same mapping as above, but convert numpy perceptions to a float tensor
    # before the forward pass.
    V = {}
    a = th.cuda(a)
    perception_indices, perceptions = env.full_batch()

    if isinstance(perceptions, np.ndarray):
        perceptions = th.float_var(
            torch.tensor(perceptions, dtype=torch.float32))

    probs = a(perception=perceptions)
    _, terms = probs.max(1)

    for perception_index in perception_indices:
        V[perception_index] = terms[perception_index].item()

    return list(V.values())
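# A minimal sketch of how agent_language_map can be exercised with a dummy
# agent and environment, assuming full_batch() returns (indices, perceptions)
# with one row per color chip. The Dummy* classes are illustrative assumptions,
# not part of the project.
import numpy as np
import torch


class DummyAgent(torch.nn.Module):
    def __init__(self, perception_dim=3, vocab_size=5):
        super().__init__()
        self.linear = torch.nn.Linear(perception_dim, vocab_size)

    def forward(self, perception):
        # One score per color term for each perceived color.
        return self.linear(perception)


class DummyEnv:
    def full_batch(self):
        # One index plus a 3-dimensional color coordinate per chip.
        perceptions = np.random.rand(8, 3).astype(np.float32)
        return list(range(len(perceptions))), perceptions


# Returns a list with one term index per color chip, e.g. [2, 2, 0, 4, ...]
language = agent_language_map(DummyEnv(), DummyAgent())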