def online_policy_update(self, board, legal_moves, logprob):
    # Re-evaluate the current position to get a fresh state-value estimate
    new_value = self.model(config.make_variable([board]),
                           config.make_variable([legal_moves]))[1].data[0, 0]
    # TD-style error: difference between the previously stored value and the new one
    reward = self.state_values[-1].data[0, 0] - new_value
    loss = -logprob * reward

    self.optimizer.zero_grad()
    loss.backward(retain_graph=True)
    self.optimizer.step()
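# Every snippet in this section leans on config.make_variable, which is not defined
# here. A minimal sketch, purely an assumption about its behaviour, is that it wraps
# array-like input into a float Variable for the (pre-0.4 style) PyTorch autograd API:
import numpy as np
import torch
from torch.autograd import Variable

def make_variable(data):
    # Hypothetical helper: convert (possibly nested) array-like data into a float Variable
    return Variable(torch.from_numpy(np.asarray(data, dtype=np.float32)))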
def evaluate(self, board_sample, legal_moves_map):
    input = config.make_variable([board_sample])
    legal_moves_map = config.make_variable(legal_moves_map)
    probs, _ = self.model(input, legal_moves_map)

    distribution = Categorical(probs)
    action = distribution.sample()
    move = (action // config.BOARD_SIZE, action % config.BOARD_SIZE)

    log_prob = distribution.log_prob(action)
    if self.train:
        self.log_probs.append(log_prob)

    return move
def update(self):
    # ---------------------- Error Logging ---------------------- #
    if not self.train:
        return None

    if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
        raise PlayerException(
            "log_probs length must be equal to rewards length as well as state_values length."
            " Got %s - %s - %s"
            % (len(self.log_probs), len(self.rewards), len(self.state_values)))
    # ----------------------------------------------------------- #

    # Bootstrapping
    rewards = self.bootstrap_rewards()
    rewards = config.make_variable(rewards)
    # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

    if self.online:
        loss = calculate_online_loss(self.state_values, rewards)
    else:
        loss = calculate_loss(self.log_probs, self.state_values, rewards)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    del self.rewards[:]
    del self.log_probs[:]
    del self.state_values[:]
    del self.board_samples[:]
    del self.legal_moves[:]

    return abs(loss.data)
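# calculate_online_loss and calculate_loss are referenced above but not defined in
# these snippets. A plausible sketch, assuming the online variant regresses the stored
# critic values onto the bootstrapped rewards and the batch variant adds the usual
# policy-gradient term weighted by the advantage (names and shapes are assumptions):
def calculate_online_loss(state_values, rewards):
    # Hypothetical: mean squared error between critic estimates and bootstrapped targets
    value_losses = [(value - reward) ** 2 for value, reward in zip(state_values, rewards)]
    return sum(value_losses) / len(value_losses)

def calculate_loss(log_probs, state_values, rewards):
    # Hypothetical actor-critic loss: the policy term uses the advantage (reward - value),
    # the value term pulls the critic towards the bootstrapped reward
    policy_losses, value_losses = [], []
    for log_prob, value, reward in zip(log_probs, state_values, rewards):
        advantage = reward - value.detach()
        policy_losses.append(-log_prob * advantage)
        value_losses.append((value - reward) ** 2)
    return (sum(policy_losses) + sum(value_losses)) / len(log_probs)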
def update(self):
    if not self.train:
        return None

    if len(self.log_probs) != len(self.rewards):
        raise abstract.PlayerException(
            "log_probs length must be equal to rewards length. Got %s - %s"
            % (len(self.log_probs), len(self.rewards)))

    rewards = self.discount_rewards(self.rewards, self.gamma)
    rewards = config.make_variable(rewards)
    # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

    # REINFORCE: weight each action's log-probability by its discounted return
    policy_losses = [-log_prob * reward for log_prob, reward in zip(self.log_probs, rewards)]

    self.optimizer.zero_grad()
    policy_loss = torch.cat(policy_losses).sum() / len(policy_losses)
    policy_loss.backward()
    self.optimizer.step()

    del self.rewards[:]
    del self.log_probs[:]

    return abs(policy_loss.data)
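# discount_rewards is not shown in these snippets. A minimal sketch, assuming the
# standard backwards pass that accumulates gamma-discounted returns over the episode:
def discount_rewards(self, rewards, gamma):
    # Walk the episode backwards, carrying the running discounted return
    discounted, running = [], 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        discounted.insert(0, running)
    return discounted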
def update(self):
    # ---------------------- Error Logging ---------------------- #
    if not self.train:
        return None

    if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
        raise PlayerException(
            "log_probs length must be equal to rewards length as well as state_values length."
            " Got %s - %s - %s"
            % (len(self.log_probs), len(self.rewards), len(self.state_values)))
    # ----------------------------------------------------------- #

    rewards = self.discount_rewards(self.rewards, self.gamma)
    rewards = self.rewards_baseline(rewards)
    rewards = config.make_variable(rewards)

    loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    del self.rewards[:]
    del self.log_probs[:]
    del self.state_values[:]
    del self.board_samples[:]
    del self.legal_moves[:]

    return abs(loss.data)
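# rewards_baseline is also undefined here. A common choice, assumed purely for
# illustration, is to subtract the mean return so the policy gradient is centred
# around zero:
def rewards_baseline(self, rewards):
    # Hypothetical baseline: subtract the batch mean from every discounted return
    mean = sum(rewards) / len(rewards)
    return [r - mean for r in rewards]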
def bootstrap_rewards(self):
    # TODO: Catch illegal use of this method
    # Critic estimates for every stored board position
    pred_values = [
        self.model(config.make_variable([self.board_samples[i]]),
                   config.make_variable([self.legal_moves[i]]))[1].data[0, 0]
        for i in range(len(self.board_samples))
    ]
    # The terminal position's value is the actual final reward
    pred_values[-1] = self.rewards[-1]

    # One-step differences between consecutive value estimates
    rewards = [pred_values[i + 1] - pred_values[i] for i in range(len(pred_values) - 1)]
    rewards.append(self.rewards[-1])
    return rewards
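# Worked example of bootstrap_rewards (numbers chosen purely for illustration): with
# critic estimates of 0.1, 0.3 and 0.6 for three consecutive positions and a final
# game reward of 1.0, pred_values becomes [0.1, 0.3, 1.0] after the terminal value is
# overwritten, and the method returns [0.2, 0.7, 1.0]: one-step changes in the
# critic's estimate, with the true terminal reward appended for the last move.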
def evaluate(self, board_sample, legal_moves_map):
    input = config.make_variable([board_sample])
    probs, state_value = self.model(input, config.make_variable(legal_moves_map))

    distribution = Categorical(probs)
    action = distribution.sample()
    log_prob = distribution.log_prob(action)
    move = (action // config.BOARD_SIZE, action % config.BOARD_SIZE)

    if self.train:
        # In online mode, apply a policy update for the previous step before storing this one
        if self.online and self.state_values:
            self.online_policy_update(board_sample, legal_moves_map, log_prob)
        self.log_probs.append(log_prob)
        self.state_values.append(state_value[0])
        self.board_samples.append(board_sample)
        self.legal_moves.append(legal_moves_map)

    return move
def evaluate(self, board_sample, legal_moves_map):
    input = config.make_variable([board_sample])
    probs, state_value = self.model(input, config.make_variable(legal_moves_map))

    try:  # Hacky way of catching a degenerate (all-zero) distribution
        distribution = Categorical(probs)
        action = distribution.sample()
        move = (action // config.BOARD_SIZE, action % config.BOARD_SIZE)
    except RuntimeError:
        # Dump the offending inputs before propagating the error
        print("Probs: \n%s \nBoard: \n%s \nLegal moves: \n%s"
              % (probs, board_sample, legal_moves_map))
        raise

    if self.train:
        self.log_probs.append(distribution.log_prob(action))
        self.state_values.append(state_value[0][0])
        self.board_samples.append(board_sample)
        self.legal_moves.append(legal_moves_map)

    return move
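# The RuntimeError caught above typically comes from Categorical receiving an all-zero
# (or NaN) probability vector after the legal-move mask is applied. A defensive sketch
# of such a masking step, assuming raw probabilities over the board and a 0/1 mask of
# the same shape (function name and behaviour are assumptions, not the model's actual code):
import torch

def mask_and_normalize(raw_probs, legal_moves_map, eps=1e-8):
    # Zero out illegal moves and renormalize; fall back to a uniform distribution
    # over legal moves if everything was zeroed out
    masked = raw_probs * legal_moves_map
    total = masked.sum()
    if total.item() <= eps:
        return legal_moves_map / legal_moves_map.sum()
    return masked / total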