Example no. 1
    def update(self):
        # ---------------------- Error Logging ---------------------- #
        if not self.train:
            return 0

        if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
            raise PlayerException("log_probs length must be equal to rewards length as well as state_values length. Got %s - %s - %s" % (len(self.log_probs), len(self.rewards), len(self.state_values)))

        rewards = self.bootstrap_rewards()
        rewards = config.make_variable(rewards)
        # rewards = self.normalize_rewards(rewards)

        if self.online:
            loss = self.calculate_online_loss(self.state_values, rewards)
        else:
            loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Clear the episode buffers for the next game
        del self.rewards[:]
        del self.log_probs[:]
        del self.state_values[:]
        del self.board_samples[:]
        del self.legal_moves[:]

        return abs(float(loss))
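The loss helpers called above are not part of this excerpt. As a rough sketch, an actor-critic calculate_loss along these lines could combine a policy term weighted by the advantage with a value-regression term for the critic. The body below is an illustration under that assumption, not the repository's actual helper; it assumes log_probs, state_values and rewards are aligned lists of one-element tensors, as in the REINFORCE variant shown next.

import torch

def calculate_loss(log_probs, state_values, rewards):
    # Illustrative actor-critic loss, not the project's implementation.
    policy_losses, value_losses = [], []
    for log_prob, value, reward in zip(log_probs, state_values, rewards):
        advantage = reward - value.detach()          # critic prediction acts as a baseline
        policy_losses.append(-log_prob * advantage)  # actor term: REINFORCE with baseline
        value_losses.append((value - reward) ** 2)   # critic term: squared error to the return
    return torch.cat(policy_losses).sum() + torch.cat(value_losses).sum()
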
    def update(self):
        if not self.train:
            return 0

        if len(self.log_probs) != len(self.rewards):
            raise PlayerException(
                "log_probs length must be equal to rewards length. Got %s - %s"
                % (len(self.log_probs), len(self.rewards)))

        rewards = self.discount_rewards(self.rewards, self.gamma)
        rewards = config.make_variable(rewards)
        # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

        # REINFORCE loss: weight each negative log-probability by its discounted
        # return, so that minimizing the summed loss maximizes expected return.
        policy_losses = [(-log_prob * reward)
                         for log_prob, reward in zip(self.log_probs, rewards)]

        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_losses).sum()
        policy_loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]

        return abs(float(policy_loss))
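discount_rewards(rewards, gamma) is used above but not shown. A conventional way to compute the discounted returns it is expected to produce is the backward pass below, written here as a free function; the project's own helper may differ in detail.

def discount_rewards(rewards, gamma):
    # Standard discounted-return computation: G_t = r_t + gamma * G_{t+1}
    discounted, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        discounted.insert(0, running)
    return discounted
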
Example no. 3
    def test_LegalSoftMax(self):
        def transform(x):
            # Tile the three sample values across a full 1x9 board and wrap as a Variable
            return [Variable(torch.DoubleTensor(x * 3).view(-1, 9))]

        edge_cases = transform([0.2, 0.3, 0.8])
        edge_cases += transform([-0.2, -0.3, -0.8])
        edge_cases += transform([0.2, -0.3, -0.8])

        edge_cases += transform([20000.3, 30000.3, 80000.3])
        edge_cases += transform([-20000.3, 30000.3, -80000.3])
        edge_cases += transform([20000.3, -30000.3, -80000.3])

        legal_moves = transform([1, 1, 1])
        legal_moves += transform([1, 1, 0])
        legal_moves += transform([1, 0, 0])

        for i, case in enumerate(edge_cases):
            for j, l_moves in enumerate(legal_moves):
                try:
                    x = Model.legal_softmax_functional(case, l_moves)
                except Exception as e:
                    raise PlayerException("LegalSoftMax failed for edge case %s and legal move %s: \n    %s" % (i, j, e))

                self.assertTrue((x == x*l_moves).all(), "LegalSoftMax did not set illegal moves to 0")
                self.assertTrue(x.sum().data[0] > 0, "x.sum <= 0 for edge case %s and legal move %s" % (i, j))
                for elem in x.data.view(-1).tolist():
                    self.assertFalse(np.isnan(elem), "LegalSoftMax produced NaN for edge case %s and legal move %s" % (i, j))
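Model.legal_softmax_functional itself is not shown in this excerpt. A masked softmax that would satisfy the assertions above (exact zeros for illegal moves, a positive sum, no overflow for very large inputs) could look roughly like the sketch below; the masking constant and the exact formulation are assumptions, not the project's code.

import torch

def legal_softmax_functional(x, legal_moves):
    # Push illegal entries far below every legal score, shift by the maximum of
    # the masked scores for numerical stability, then renormalize so that only
    # legal moves carry probability mass.
    masked = x * legal_moves - (1 - legal_moves) * 1e12
    exps = torch.exp(masked - masked.max()) * legal_moves
    return exps / exps.sum()
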
    def update(self):
        # ---------------------- Error Logging ---------------------- #
        if not self.train:
            return None

        if (len(self.log_probs) != len(self.rewards)
                or len(self.log_probs) != len(self.state_values)):
            raise PlayerException(
                "log_probs length must be equal to rewards length as well as state_values length. Got %s - %s - %s"
                % (len(self.log_probs), len(self.rewards), len(self.state_values)))

        # ----------------------------------------------------------- #

        rewards = self.discount_rewards(self.rewards, self.gamma)
        rewards = self.rewards_baseline(rewards)
        rewards = config.make_variable(rewards)

        loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        del self.rewards[:]
        del self.log_probs[:]
        del self.state_values[:]
        del self.board_samples[:]
        del self.legal_moves[:]

        return abs(loss.data)
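rewards_baseline is also referenced without its definition. A simple variance-reduction baseline consistent with how it is used above is to subtract the mean return of the episode; this is only one plausible sketch, and the project's baseline may be a running or learned one.

def rewards_baseline(rewards):
    # Subtract the episode's mean return so better-than-average moves get a
    # positive weight and worse ones a negative weight (reduces gradient variance).
    mean = sum(rewards) / len(rewards)
    return [r - mean for r in rewards]
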
Example no. 5
class ExperiencedPlayer(Player):
    """ Wins games or blocks opponent with the next move. Uses Heuristic Table if there are no winning or blocking moves"""
    if config.BOARD_SIZE == 3:
        heuristic_table = np.array([[1, 0.5, 1],
                                    [0.5, 0.75, 0.5],
                                    [1, 0.5, 1]])
    elif config.BOARD_SIZE == 8:
        heuristic_table = np.array([[1, 2, 3, 4, 4, 3, 2, 1],
                                    [2, 3, 4, 5, 5, 4, 3, 2],
                                    [3, 4, 5, 6, 6, 5, 4, 3],
                                    [4, 5, 6, 7, 7, 6, 5, 4],
                                    [4, 5, 6, 7, 7, 6, 5, 4],
                                    [3, 4, 5, 6, 6, 5, 4, 3],
                                    [2, 3, 4, 5, 5, 4, 3, 2],
                                    [1, 2, 3, 4, 4, 3, 2, 1]])
    else:
        raise PlayerException(
            "ExperiencedPlayer is not implemented for board size == %s" %
            config.BOARD_SIZE)

    def __init__(self, deterministic=True, block_mid=False):
        self.deterministic = deterministic
        self.block_mid = block_mid

    def get_move(self, board):
        valid_moves = board.get_valid_moves(self.color)

        if (self.block_mid and sum(board.count_stones()) == 1
                and (1, 1) in valid_moves):
            return 1, 1

        denies, attacks = [], []
        for move in valid_moves:
            afterstate = board.copy().apply_move(move, self.color)
            if afterstate.game_won() == self.color:
                return move

            afterstate_opponent = board.copy().apply_move(
                move, board.other_color(self.color))
            if afterstate_opponent.game_won() == board.other_color(self.color):
                denies.append(
                    (self.evaluate_heuristic_table(afterstate_opponent), move))

            attacks.append((self.evaluate_heuristic_table(afterstate), move))

        if denies:
            return max(denies)[1]
        else:
            return max(attacks)[1]

    def evaluate_heuristic_table(self, board):
        self_mask = board.board == self.color
        other_mask = board.board == board.other_color(self.color)
        score = np.sum(self.heuristic_table * self_mask -
                       self.heuristic_table * other_mask)
        if not self.deterministic:
            score += random() * 0.001  # Bring some randomness to equally valued boards
        return score
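
As a hypothetical usage example (the Board class, the way a player's color is assigned, and the config.BLACK constant are assumptions inferred from the calls above, not shown in this excerpt):

player = ExperiencedPlayer(deterministic=False, block_mid=True)
player.color = config.BLACK     # assumed: the game loop assigns each player's color
board = Board()                 # assumed Board API: get_valid_moves, copy, apply_move, ...
move = player.get_move(board)   # winning move if any, else a blocking move, else best heuristic move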