Example #1
    def online_policy_update(self, board, legal_moves, logprob):
        """Not tested after the PyTorch update."""
        # The model's second output is the state value for this position.
        new_value = self.model(config.make_variable([board]),
                               config.make_variable([legal_moves]))[1].data[0, 0]
        # Reward is the difference between the last stored value estimate and the new one.
        reward = self.state_values[-1] - new_value
        loss = -logprob * reward

        self.optimizer.zero_grad()
        loss.backward(retain_graph=True)
        self.optimizer.step()

    def evaluate(self, board_sample, legal_moves_map):
        board_input = config.make_variable([board_sample])

        # The model takes the board and the legal-move map and returns
        # move probabilities and a state value.
        probs, state_value = self.model(board_input,
                                        config.make_variable(legal_moves_map))
        distribution = Categorical(probs)
        action = distribution.sample()

        # Convert the flat action index back into (row, column) board coordinates.
        move = (int(action) // config.BOARD_SIZE,
                int(action) % config.BOARD_SIZE)
        if self.train:
            self.log_probs.append(distribution.log_prob(action))
        return move
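
Note: config.make_variable is used throughout these examples but is not part of the excerpts. A minimal sketch of what it presumably does, assuming it only wraps raw board data in a float tensor (the name hints at the pre-0.4 torch.autograd.Variable API); this is an assumption, not the project's implementation:

import torch

def make_variable(data, dtype=torch.float):
    # Hypothetical stand-in for config.make_variable: convert a (nested) list
    # or array into a float tensor. Pre-0.4 code would have wrapped this in
    # torch.autograd.Variable; after the Variable/Tensor merge a plain tensor
    # is enough.
    return torch.as_tensor(data, dtype=dtype)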
Example #3
    def update(self):
        # ---------------------- Error handling ---------------------- #
        if not self.train:
            return 0

        if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
            raise PlayerException(
                "log_probs, rewards and state_values must have equal lengths. Got %s - %s - %s"
                % (len(self.log_probs), len(self.rewards), len(self.state_values)))

        rewards = self.bootstrap_rewards()
        rewards = config.make_variable(rewards)
        # rewards = self.normalize_rewards(rewards)

        # Online mode computes the loss from the stored state values and rewards;
        # batch mode additionally uses the stored log probabilities.
        if self.online:
            loss = self.calculate_online_loss(self.state_values, rewards)
        else:
            loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Clear the buffers for the next episode.
        del self.rewards[:]
        del self.log_probs[:]
        del self.state_values[:]
        del self.board_samples[:]
        del self.legal_moves[:]

        return abs(float(loss))
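
calculate_loss and calculate_online_loss are defined elsewhere in the project and are not shown in this excerpt. As a rough illustration only (an assumption, not the project's code), an actor-critic loss built from the buffered log probabilities, state values and rewards could look like this:

import torch
import torch.nn.functional as F

def calculate_loss(log_probs, state_values, rewards):
    # Illustrative actor-critic loss: the actor term weights -log_prob by the
    # advantage (reward - V(s)); the critic term pulls V(s) towards the reward.
    policy_losses, value_losses = [], []
    for log_prob, value, reward in zip(log_probs, state_values, rewards):
        # Plain float advantage, so the actor gradient only flows through log_prob.
        advantage = float(reward) - float(value)
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value.view(-1), reward.view(-1)))
    return torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()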
    def update(self):
        if not self.train:
            return 0

        if len(self.log_probs) != len(self.rewards):
            raise PlayerException(
                "log_probs length must be equal to rewards length. Got %s - %s"
                % (len(self.log_probs), len(self.rewards)))

        rewards = self.discount_rewards(self.rewards, self.gamma)
        rewards = config.make_variable(rewards)
        # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

        # REINFORCE: weight each move's negative log probability by its discounted return.
        policy_losses = [-log_prob * reward
                         for log_prob, reward in zip(self.log_probs, rewards)]

        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_losses).sum()
        policy_loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]

        return abs(float(policy_loss))
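
self.discount_rewards and self.gamma are likewise defined outside this excerpt; the call presumably computes the usual discounted return for each step. A minimal sketch of that computation (not the project's implementation):

def discount_rewards(rewards, gamma):
    # Walk the reward list backwards, accumulating G_t = r_t + gamma * G_{t+1},
    # so each step is credited with the discounted sum of everything after it.
    returns = []
    running = 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.insert(0, running)
    return returns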
Example #5
    def evaluate(self, board_sample, legal_moves_map):
        board_input = config.make_variable([board_sample])

        probs, state_value = self.model(board_input, config.make_variable(legal_moves_map))
        distribution = Categorical(probs)
        action = distribution.sample()
        log_prob = distribution.log_prob(action)

        move = (int(action) // config.BOARD_SIZE, int(action) % config.BOARD_SIZE)
        if self.train:
            # In online mode the policy is updated immediately after every move.
            if self.online:
                self.online_policy_update(board_sample, legal_moves_map, log_prob)

            # Buffer everything needed for the later update().
            self.log_probs.append(log_prob)
            self.state_values.append(state_value[0])
            self.board_samples.append(board_sample)
            self.legal_moves.append(legal_moves_map)
        return move

    def evaluate(self, board_sample, legal_moves_map):
        board_input = config.make_variable([board_sample])

        try:
            probs, state_value = self.model(
                board_input, config.make_variable(legal_moves_map))
            distribution = Categorical(probs)
            action = distribution.sample()
        except RuntimeError:
            # The forward pass or sampling can raise a RuntimeError (e.g. an
            # invalid probability distribution); print the offending board and
            # retry once, letting a second failure propagate.
            print("Invalid distribution. Board sample: \n%s" % board_sample)
            probs, state_value = self.model(
                board_input, config.make_variable(legal_moves_map))
            distribution = Categorical(probs)
            action = distribution.sample()

        move = (int(action) // config.BOARD_SIZE,
                int(action) % config.BOARD_SIZE)
        if self.train:
            self.log_probs.append(distribution.log_prob(action))
            self.state_values.append(state_value[0])
            self.board_samples.append(board_sample)
            self.legal_moves.append(legal_moves_map)
        return move