def update(self):
    # ---------------------- Error Logging ---------------------- #
    if not self.train:
        return 0

    if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
        raise PlayerException("log_probs length must be equal to rewards length as well as state_values length. Got %s - %s - %s"
                              % (len(self.log_probs), len(self.rewards), len(self.state_values)))
    # ------------------------------------------------------------ #

    rewards = self.bootstrap_rewards()
    rewards = config.make_variable(rewards)
    # rewards = self.normalize_rewards(rewards)

    if self.online:
        loss = self.calculate_online_loss(self.state_values, rewards)
    else:
        loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    del self.rewards[:]
    del self.log_probs[:]
    del self.state_values[:]
    del self.board_samples[:]
    del self.legal_moves[:]

    return abs(float(loss))
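# Note: bootstrap_rewards() is called above but is not shown in this section.
# The sketch below is only an assumption of what it might compute (one-step
# bootstrapped targets r_t + gamma * V(s_{t+1}) built from the recorded rewards
# and critic state values); the repository's actual implementation may differ.
def bootstrap_rewards(self):
    targets = []
    for t in range(len(self.rewards)):
        if t + 1 < len(self.state_values):
            next_value = float(self.state_values[t + 1])  # critic's estimate of the following state
        else:
            next_value = 0.0                              # last step: no future value to bootstrap from
        targets.append(self.rewards[t] + self.gamma * next_value)
    return targets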
def update(self):
    if not self.train:
        return 0

    if len(self.log_probs) != len(self.rewards):
        raise PlayerException("log_probs length must be equal to rewards length. Got %s - %s"
                              % (len(self.log_probs), len(self.rewards)))

    rewards = self.discount_rewards(self.rewards, self.gamma)
    rewards = config.make_variable(rewards)
    # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

    policy_losses = [(-log_prob * reward) for log_prob, reward in zip(self.log_probs, rewards)]

    self.optimizer.zero_grad()
    policy_loss = torch.cat(policy_losses).sum()
    policy_loss.backward()
    self.optimizer.step()

    del self.rewards[:]
    del self.log_probs[:]

    return abs(float(policy_loss))
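# Note: discount_rewards() is referenced above but not shown in this section.
# The sketch below is a minimal, hypothetical version of the standard
# discounted-return computation G_t = r_t + gamma * G_{t+1}; the actual method
# in the repository may differ.
def discount_rewards(self, rewards, gamma):
    discounted = []
    running_return = 0.0
    for reward in reversed(rewards):
        running_return = reward + gamma * running_return  # accumulate the return from the end of the episode
        discounted.insert(0, running_return)              # keep chronological order
    return discounted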
def test_LegalSoftMax(self):
    def transform(x):
        return [Variable(torch.DoubleTensor(x * 3).view(-1, 9))]

    edge_cases = transform([0.2, 0.3, 0.8])
    edge_cases += transform([-0.2, -0.3, -0.8])
    edge_cases += transform([0.2, -0.3, -0.8])
    edge_cases += transform([20000.3, 30000.3, 80000.3])
    edge_cases += transform([-20000.3, 30000.3, -80000.3])
    edge_cases += transform([20000.3, -30000.3, -80000.3])

    legal_moves = transform([1, 1, 1])
    legal_moves += transform([1, 1, 0])
    legal_moves += transform([1, 0, 0])

    for i, case in enumerate(edge_cases):
        for j, l_moves in enumerate(legal_moves):
            try:
                x = Model.legal_softmax_functional(case, l_moves)
            except Exception as e:
                raise PlayerException("LegalSoftMax failed for edge case %s and legal move %s: \n %s" % (i, j, e))

            self.assertTrue((x == x * l_moves).all(), "LegalSoftMax did not set illegal moves to 0")
            self.assertTrue(x.sum().data[0] > 0, "x.sum <= 0 for edge case %s and legal move %s" % (i, j))
            for elem in x.data.view(-1).tolist():
                # assertNotEqual(elem, np.nan) always passes because NaN compares unequal to everything
                self.assertFalse(np.isnan(elem), "LegalSoftMax produced NaN for edge case %s and legal move %s" % (i, j))
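# Note: Model.legal_softmax_functional() is exercised by the test above but not
# defined in this section. The sketch below is an assumed implementation of a
# masked softmax that satisfies the tested properties (illegal moves get exactly
# zero probability, legal probabilities stay finite and sum to a positive value);
# the repository's actual code may differ.
import torch.nn.functional as F

def legal_softmax_functional(x, legal_moves):
    masked_logits = x.masked_fill(legal_moves == 0, -float('inf'))  # illegal moves get no probability mass
    probs = F.softmax(masked_logits, dim=-1)
    return probs * legal_moves                                      # force exact zeros on illegal entries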
def update(self):
    # ---------------------- Error Logging ---------------------- #
    if not self.train:
        return None

    if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
        raise PlayerException("log_probs length must be equal to rewards length as well as state_values length. Got %s - %s - %s"
                              % (len(self.log_probs), len(self.rewards), len(self.state_values)))
    # ------------------------------------------------------------ #

    rewards = self.discount_rewards(self.rewards, self.gamma)
    rewards = self.rewards_baseline(rewards)
    rewards = config.make_variable(rewards)

    loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    del self.rewards[:]
    del self.log_probs[:]
    del self.state_values[:]
    del self.board_samples[:]
    del self.legal_moves[:]

    return abs(float(loss))
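# Note: calculate_loss() is used above but not defined in this section. The
# sketch below is an assumption of what a combined actor-critic loss over the
# collected trajectory could look like (a policy term weighted by the advantage
# plus a value-regression term for the critic); the repository's actual
# implementation may differ.
def calculate_loss(self, log_probs, state_values, rewards):
    policy_loss = 0
    value_loss = 0
    for log_prob, value, reward in zip(log_probs, state_values, rewards):
        advantage = reward - value.detach()               # how much better the outcome was than the critic predicted
        policy_loss = policy_loss - log_prob * advantage  # REINFORCE term scaled by the advantage
        value_loss = value_loss + (value - reward) ** 2   # critic regression towards the observed return
    return (policy_loss + value_loss).sum()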
class ExperiencedPlayer(Player):
    """ Wins the game or blocks the opponent with the next move.
        Uses a heuristic table if there are no winning or blocking moves. """

    if config.BOARD_SIZE == 3:
        heuristic_table = np.array([[1, 0.5, 1],
                                    [0.5, 0.75, 0.5],
                                    [1, 0.5, 1]])
    elif config.BOARD_SIZE == 8:
        heuristic_table = np.array([[1, 2, 3, 4, 4, 3, 2, 1],
                                    [2, 3, 4, 5, 5, 4, 3, 2],
                                    [3, 4, 5, 6, 6, 5, 4, 3],
                                    [4, 5, 6, 7, 7, 6, 5, 4],
                                    [4, 5, 6, 7, 7, 6, 5, 4],
                                    [3, 4, 5, 6, 6, 5, 4, 3],
                                    [2, 3, 4, 5, 5, 4, 3, 2],
                                    [1, 2, 3, 4, 4, 3, 2, 1]])
    else:
        raise PlayerException("HeuristicPlayer is not implemented for board size == %s" % config.BOARD_SIZE)

    def __init__(self, deterministic=True, block_mid=False):
        self.deterministic = deterministic
        self.block_mid = block_mid

    def get_move(self, board):
        valid_moves = board.get_valid_moves(self.color)
        if self.block_mid and sum(board.count_stones()) == 1 and (1, 1) in valid_moves:
            return 1, 1

        denies, attacks = [], []
        for move in valid_moves:
            afterstate = board.copy().apply_move(move, self.color)
            if afterstate.game_won() == self.color:
                return move

            afterstate_opponent = board.copy().apply_move(move, board.other_color(self.color))
            if afterstate_opponent.game_won() == board.other_color(self.color):
                denies.append((self.evaluate_heuristic_table(afterstate_opponent), move))

            attacks.append((self.evaluate_heuristic_table(afterstate), move))

        if denies:
            return max(denies)[1]
        else:
            return max(attacks)[1]

    def evaluate_heuristic_table(self, board):
        self_mask = board.board == self.color
        other_mask = board.board == board.other_color(self.color)
        score = np.sum(self.heuristic_table * self_mask - self.heuristic_table * other_mask)
        if not self.deterministic:
            score += random() * 0.001  # Bring some randomness to equally valued boards
        return score
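# Illustration only (not part of the player): how evaluate_heuristic_table scores
# a position. The 3x3 board and the color encoding (1 for the player, 2 for the
# opponent, 0 for empty) are assumptions made for this standalone example.
import numpy as np

heuristic_table = np.array([[1, 0.5, 1],
                            [0.5, 0.75, 0.5],
                            [1, 0.5, 1]])
board = np.array([[1, 0, 0],
                  [0, 2, 0],
                  [0, 0, 1]])  # the player holds two corners, the opponent holds the centre
self_mask = board == 1
other_mask = board == 2
score = np.sum(heuristic_table * self_mask - heuristic_table * other_mask)
print(score)  # 1 + 1 - 0.75 = 1.25: the two corners outweigh the centre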