import torch
from torch.distributions import Categorical

# config and PlayerException are project-local imports assumed to be available
# elsewhere in the repository.


def online_policy_update(self, board, legal_moves, logprob):
    """Not tested after the PyTorch update."""
    # Re-evaluate the critic on the current position and take the raw value.
    new_value = self.model(config.make_variable([board]),
                           config.make_variable([legal_moves]))[1].data[0, 0]
    # TD-style signal: difference between the last stored value estimate
    # and the freshly computed one.
    reward = self.state_values[-1] - new_value
    loss = -logprob * reward
    self.optimizer.zero_grad()
    # retain_graph=True because the same log-prob graph is reused by the
    # batched update at the end of the episode.
    loss.backward(retain_graph=True)
    self.optimizer.step()
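# config.make_variable is used throughout but not defined in this section. A
# minimal sketch of what it is assumed to do on a post-Variable version of
# PyTorch (make_variable_sketch is a hypothetical stand-in, not the project's
# actual helper):
def make_variable_sketch(data):
    # Wrap raw Python / numpy data in a float tensor for the model; on the
    # legacy API this would have been Variable(torch.FloatTensor(data)).
    return torch.as_tensor(data, dtype=torch.float)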
def evaluate(self, board_sample, legal_moves_map):
    model_input = config.make_variable([board_sample])
    probs, state_value = self.model(model_input, config.make_variable(legal_moves_map))

    # Sample a move from the masked policy distribution.
    distribution = Categorical(probs)
    action = distribution.sample()

    # Decode the flat action index into (row, column) board coordinates.
    move = (int(action) // config.BOARD_SIZE, int(action) % config.BOARD_SIZE)

    if self.train:
        self.log_probs.append(distribution.log_prob(action))
    return move
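# The model is assumed to zero out illegal moves and renormalize its policy
# output, so that Categorical above can only sample legal actions. A sketch of
# such masking (mask_policy_sketch is hypothetical; the project's model may do
# this differently):
def mask_policy_sketch(raw_probs, legal_moves_map):
    # raw_probs: (1, BOARD_SIZE * BOARD_SIZE) policy output;
    # legal_moves_map: same shape, 1.0 for legal squares, 0.0 otherwise.
    masked = raw_probs * legal_moves_map
    return masked / masked.sum(dim=1, keepdim=True)
# Note: if every legal square gets zero probability, the renormalization
# divides by zero, which is one way the RuntimeError handled in the last
# evaluate() variant below can arise.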
def update(self):
    # ---------------------- Error Logging ---------------------- #
    if not self.train:
        return 0

    if len(self.log_probs) != len(self.rewards) or len(self.log_probs) != len(self.state_values):
        raise PlayerException(
            "log_probs length must be equal to rewards length as well as state_values length. "
            "Got %s - %s - %s" % (len(self.log_probs), len(self.rewards), len(self.state_values)))

    rewards = self.bootstrap_rewards()
    rewards = config.make_variable(rewards)
    # rewards = self.normalize_rewards(rewards)

    if self.online:
        loss = self.calculate_online_loss(self.state_values, rewards)
    else:
        loss = self.calculate_loss(self.log_probs, self.state_values, rewards)

    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    # Reset the episode buffers for the next game.
    del self.rewards[:]
    del self.log_probs[:]
    del self.state_values[:]
    del self.board_samples[:]
    del self.legal_moves[:]

    return abs(float(loss))
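# bootstrap_rewards, calculate_loss and calculate_online_loss are referenced
# above but not shown. A sketch of a conventional actor-critic loss, under the
# assumption that it combines the advantage-weighted policy term with a value-
# regression term (calculate_loss_sketch and the smooth-L1 choice are
# assumptions, not the repository's implementation):
import torch.nn.functional as F

def calculate_loss_sketch(log_probs, state_values, rewards):
    policy_losses = []
    value_losses = []
    for log_prob, value, reward in zip(log_probs, state_values, rewards):
        advantage = reward - value.detach()  # baseline-corrected return
        policy_losses.append(-log_prob * advantage)
        value_losses.append(F.smooth_l1_loss(value.squeeze(), reward))
    return torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()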
def update(self):
    if not self.train:
        return 0

    if len(self.log_probs) != len(self.rewards):
        raise PlayerException(
            "log_probs length must be equal to rewards length. Got %s - %s"
            % (len(self.log_probs), len(self.rewards)))

    rewards = self.discount_rewards(self.rewards, self.gamma)
    rewards = config.make_variable(rewards)
    # rewards = self.normalize_rewards(rewards)  # For now nothing to normalize, standard deviation = 0

    # Classic REINFORCE: weight each log-prob by its discounted return.
    policy_losses = [-log_prob * reward for log_prob, reward in zip(self.log_probs, rewards)]

    self.optimizer.zero_grad()
    policy_loss = torch.cat(policy_losses).sum()
    policy_loss.backward()
    self.optimizer.step()

    # Reset the episode buffers for the next game.
    del self.rewards[:]
    del self.log_probs[:]

    return abs(float(policy_loss))
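# discount_rewards is referenced above but not shown. The standard
# backward-accumulation of gamma-discounted returns it presumably computes
# (a sketch, not necessarily the repository's code):
def discount_rewards_sketch(rewards, gamma):
    discounted = []
    running = 0.0
    for r in reversed(rewards):
        # Each step earns its own reward plus the discounted future return.
        running = r + gamma * running
        discounted.insert(0, running)
    return discounted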
def evaluate(self, board_sample, legal_moves_map):
    model_input = config.make_variable([board_sample])
    probs, state_value = self.model(model_input, config.make_variable(legal_moves_map))

    distribution = Categorical(probs)
    action = distribution.sample()
    log_prob = distribution.log_prob(action)

    # Decode the flat action index into (row, column) board coordinates.
    move = (int(action) // config.BOARD_SIZE, int(action) % config.BOARD_SIZE)

    if self.train:
        if self.online:
            # Apply an immediate gradient step before the batched end-of-episode update.
            self.online_policy_update(board_sample, legal_moves_map, log_prob)
        self.log_probs.append(log_prob)
        self.state_values.append(state_value[0])
        self.board_samples.append(board_sample)
        self.legal_moves.append(legal_moves_map)
    return move
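# Hedged usage sketch of how evaluate() and update() are presumably interleaved
# over one self-play episode (play_episode_sketch and the env interface are
# hypothetical; the reward bookkeeping via player.rewards matches the buffers
# the methods above maintain):
def play_episode_sketch(player, env):
    board, legal_moves = env.reset()
    done = False
    while not done:
        move = player.evaluate(board, legal_moves)   # samples a move, stores its log-prob
        board, legal_moves, reward, done = env.step(move)
        player.rewards.append(reward)                # keep rewards aligned with log_probs
    return player.update()                           # one gradient step over the episode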
def evaluate(self, board_sample, legal_moves_map):
    model_input = config.make_variable([board_sample])
    try:
        probs, state_value = self.model(model_input, config.make_variable(legal_moves_map))
        distribution = Categorical(probs)
        action = distribution.sample()
    except RuntimeError:
        # Categorical raises a RuntimeError when it receives an invalid
        # probability vector; log the offending board and retry once.
        print("Invalid distribution. Board sample: \n%s" % board_sample)
        probs, state_value = self.model(model_input, config.make_variable(legal_moves_map))
        distribution = Categorical(probs)
        action = distribution.sample()

    # Decode the flat action index into (row, column) board coordinates.
    move = (int(action) // config.BOARD_SIZE, int(action) % config.BOARD_SIZE)

    if self.train:
        self.log_probs.append(distribution.log_prob(action))
        self.state_values.append(state_value[0])
        self.board_samples.append(board_sample)
        self.legal_moves.append(legal_moves_map)
    return move
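# normalize_rewards is commented out in both update() variants above. If it is
# re-enabled, a conventional implementation standardizes the returns to zero
# mean and unit variance; the epsilon guard covers the zero-standard-deviation
# case the REINFORCE update's comment warns about (a sketch, not the project's
# code):
def normalize_rewards_sketch(rewards):
    eps = 1e-8
    return (rewards - rewards.mean()) / (rewards.std() + eps)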