Example #1
    def play(self, env, agent_a, agent_b):
        agent_a = th.cuda(agent_a)
        agent_b = th.cuda(agent_b)

        # A single Adam optimizer updates both agents' parameters jointly
        optimizer = optim.Adam(
            list(agent_a.parameters()) + list(agent_b.parameters()))

        for i in range(self.max_epochs):
            optimizer.zero_grad()

            # Sample a mini-batch of color codes and their color values
            color_codes, colors = env.mini_batch(batch_size=self.batch_size)
            color_codes = th.long_var(color_codes)
            colors = th.float_var(colors)

            # One communication round: agent_a acts as sender, agent_b as receiver
            loss = self.communication_channel(env, agent_a, agent_b,
                                              color_codes, colors)

            loss.backward()
            optimizer.step()

            # printing status
            if self.print_interval != 0 and ((i + 1) % self.print_interval
                                             == 0):
                self.print_status(loss)

            if self.evaluate_interval != 0 and ((i + 1) %
                                                self.evaluate_interval == 0):
                self.evaluate(env, agent_a)

        return agent_a.cpu()
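
These examples all rely on a small helper module imported as `th` that is not shown on this page. The sketch below is an assumption about what such helpers could look like: only the names `th.cuda`, `th.long_var` and `th.float_var` come from the snippets; the bodies simply wrap optional GPU placement and tensor construction.

import torch


def cuda(module_or_tensor):
    # Move to the GPU only when one is available; otherwise stay on the CPU.
    return module_or_tensor.cuda() if torch.cuda.is_available() else module_or_tensor


def long_var(data):
    # Integer inputs (e.g. color codes) as a LongTensor on the active device.
    return cuda(torch.as_tensor(data, dtype=torch.long))


def float_var(data):
    # Real-valued inputs (e.g. color coordinates) as a FloatTensor on the active device.
    return cuda(torch.as_tensor(data, dtype=torch.float32))
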
    def play(self, env, agent_a, agent_b):
        agent_a = th.cuda(agent_a)
        agent_b = th.cuda(agent_b)
        # A separate optimizer for the receiver alone, plus a joint optimizer for both agents
        receiver_opt = optim.Adam(list(agent_b.parameters()))
        optimizer = optim.Adam(
            list(agent_a.parameters()) + list(agent_b.parameters()))

        for i in range(self.max_epochs):
            # Train the receiver on its own for 50 steps before each joint update
            for j in range(50):
                color_codes, colors = env.mini_batch(
                    batch_size=self.batch_size)
                color_codes = th.long_var(color_codes)
                colors = th.float_var(colors)
                receiver_loss, _, _ = self.communication_channel(
                    env, agent_a, agent_b, color_codes, colors)
                receiver_loss.backward()
                receiver_opt.step()
                receiver_opt.zero_grad()
            self.board_reward = 0
            optimizer.zero_grad()

            color_codes, colors = env.mini_batch(batch_size=self.batch_size)
            color_codes = th.long_var(color_codes)
            colors = th.float_var(colors)

            receiver_loss, sender_loss, entropy_loss = self.communication_channel(
                env, agent_a, agent_b, color_codes, colors)
            loss = receiver_loss + sender_loss + entropy_loss
            loss.backward()
            optimizer.step()

            # Update tensorboard
            if self.print_interval != 0 and ((i + 1) % self.print_interval
                                             == 0):
                self.tensorboard_update(i, env, agent_a, agent_b)
            # printing status
            if self.print_interval != 0 and ((i + 1) % self.print_interval
                                             == 0):
                if self.loss_type == 'REINFORCE':
                    self.print_status(loss)

            if self.evaluate_interval != 0 and ((i + 1) %
                                                self.evaluate_interval == 0):
                self.evaluate(env, agent_a)
Example #3
def agent_language_map(env, a):
    # Map each perception in the environment to the term the agent assigns it
    V = {}
    a = th.cuda(a)
    perception_indices, perceptions = env.full_batch()

    probs = a(perception=perceptions)
    _, terms = probs.max(1)

    for perception_index in perception_indices:
        V[perception_index] = terms[perception_index].item()

    return list(V.values())
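
For orientation, the returned list holds one term index per perception, so the size of the vocabulary an agent actually uses can be read off directly. The usage below is hypothetical: `env` and `agent_a` are assumed to come from the training examples above; only `agent_language_map` itself is taken from this snippet.

# Hypothetical usage with an environment and a trained agent from the examples above
language = agent_language_map(env, agent_a)   # one term index per perception
num_terms = len(set(language))                # distinct terms actually used
print('the agent partitions the perceptual space into', num_terms, 'terms')
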
    def play(self, env, agent_a, agent_b):
        agent_a = th.cuda(agent_a)
        agent_b = th.cuda(agent_b)
        optimizer = optim.Adam(list(agent_a.parameters()) +
                               list(agent_b.parameters()),
                               lr=0.0001)

        for i in range(self.max_epochs):
            optimizer.zero_grad()

            color_codes, colors = env.mini_batch(batch_size=self.batch_size)
            color_codes = th.long_var(color_codes)
            colors = th.float_var(colors)

            loss = self.communication_channel(env, agent_a, agent_b,
                                              color_codes, colors)
            loss.backward()
            optimizer.step()

            # printing status
            if self.print_interval != 0 and ((i + 1) % self.print_interval
                                             == 0):
                # The status print is the same for REINFORCE and other loss types
                self.print_status(loss)

            if self.evaluate_interval != 0 and ((i + 1) %
                                                self.evaluate_interval == 0):
                self.evaluate(env, agent_a)


        return agent_a.cpu()
    def play(self, env, agent_a, agent_b):
        agent_a = th.cuda(agent_a)
        agent_b = th.cuda(agent_b)

        optimizer = optim.Adam(list(agent_a.parameters()) +
                               list(agent_b.parameters()),
                               lr=0.0001)

        for i in range(self.max_epochs):
            optimizer.zero_grad()
            # Agent a sends a message
            color_codes, colors = env.mini_batch(batch_size=self.batch_size)
            color_codes = th.long_var(color_codes)
            colors = th.float_var(colors)
            loss1 = self.communication_channel(env, agent_a, agent_b,
                                               color_codes, colors)
            loss1.backward()
            # Agent b sends a message
            color_codes, colors = env.mini_batch(batch_size=self.batch_size)
            color_codes = th.long_var(color_codes)
            colors = th.float_var(colors)
            loss2 = self.communication_channel(env, agent_b, agent_a,
                                               color_codes, colors)
            loss2.backward()
            # Apply the gradients accumulated from both directions
            optimizer.step()
            loss = loss1 + loss2  # combined value, used only for status printing
            # printing status
            if self.print_interval != 0 and ((i + 1) % self.print_interval
                                             == 0):
                self.print_status(loss)

            if self.evaluate_interval != 0 and ((i + 1) %
                                                self.evaluate_interval == 0):
                self.evaluate(env, agent_a)

        return agent_a.cpu()
def agent_language_map(env, a):
    # Map each perception in the environment to the term the agent assigns it
    V = {}
    a = th.cuda(a)
    perception_indices, perceptions = env.full_batch()
    # Raw numpy perceptions are converted to a float tensor before the forward pass
    if isinstance(perceptions, np.ndarray):
        perceptions = th.float_var(
            torch.tensor(perceptions, dtype=torch.float32))
    probs = a(perception=perceptions)
    _, terms = probs.max(1)

    for perception_index in perception_indices:
        V[perception_index] = terms[perception_index].item()

    return list(V.values())