import math
import random

import ref   # project-local reference-policy helpers
import util  # project-local move enumeration


def get_normalizer(self, state, Q):
    """Partition function over the opponent's moves: the sum of each
    action's reference-policy weight times exp(beta_estimation * Q)."""
    possible_actions = util.possible_moves(state, 'opponent')
    total = 0  # renamed to avoid shadowing the built-in sum()
    for action in possible_actions:
        total += (ref.ref_opponent(state, action, len(possible_actions))
                  * math.exp(self.beta_estimation * Q.get_Q_opponent(state, action)))
    return total
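# Hedged usage sketch (not part of the original class): with the same
# util/ref helpers, the normalizer converts one reference-weighted
# Boltzmann term into a probability; the function name is an assumption.
def opponent_move_probability(agent, state, action, Q):
    possible_actions = util.possible_moves(state, 'opponent')
    weight = (ref.ref_opponent(state, action, len(possible_actions))
              * math.exp(agent.beta_estimation * Q.get_Q_opponent(state, action)))
    return weight / agent.get_normalizer(state, Q)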
def choose_move(self, state, Q):
    """Epsilon-greedy selection: exploit the best action under Q, or explore."""
    possible_actions = util.possible_moves(state)
    if self.epsilon_greedy():
        return self.get_best_action_based_on_Q(Q, state, possible_actions)
    return random.choice(possible_actions)
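# Hedged sketch of the epsilon_greedy() coin flip used above; the real
# implementation and the self.epsilon attribute name are assumptions.
def epsilon_greedy(self):
    # True with probability 1 - epsilon: exploit the greedy action.
    return random.random() > self.epsilon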
def get_Q_opponent(self, state, action):
    """Soft backup of the opponent's Q: a log-sum-exp over the player's
    replies, weighted by the player's reference policy."""
    possible_actions_player = util.possible_moves(state, player='player')
    total = 0
    for action_pl in possible_actions_player:
        total += (self.player.get_reference(state, action_pl, len(possible_actions_player))
                  * math.exp(self.player.beta
                             * self.values.get(state, {}).get((action_pl, action), 0)))
    return math.log(total) / self.player.beta
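# Standalone sanity check (an illustration, not original code): with
# uniform reference weights, the soft value log(sum(w * exp(beta * Q))) / beta
# recovers the mean of Q for small beta and max(Q) for large beta, which
# is why beta acts as a rationality parameter in the backups here.
def soft_value_demo():
    Qs = [1.0, 2.0, 3.0]
    for beta in (0.1, 1.0, 10.0):
        soft = math.log(sum(math.exp(beta * q) / len(Qs) for q in Qs)) / beta
        print(beta, soft)  # near mean(Qs) for small beta, near max(Qs) for large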
def estimate_v_new_state(self, new_state):
    """Soft state value V(s'): a log-sum-exp of the player's Q values,
    weighted by the player's reference policy."""
    possible_actions_player = util.possible_moves(new_state)
    total = 0
    for action in possible_actions_player:
        Q_player = self.get_Q_player(new_state, action,
                                     self.player.use_estimation,
                                     self.player.get_beta_estimation())
        total += (self.player.get_reference(new_state, action, len(possible_actions_player))
                  * math.exp(self.player.beta * Q_player))
    return math.log(total) / self.player.beta
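# Hedged sketch: estimate_v_new_state is the natural bootstrap term of a
# temporal-difference target r + gamma * V(s'); the method name, reward
# argument, and gamma default are illustrative assumptions.
def td_target(self, reward, new_state, gamma=0.9):
    return reward + gamma * self.estimate_v_new_state(new_state)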
def get_Q_player(self, state, action, use_estimation, beta_estimation):
    """Soft backup of the player's Q over the opponent's replies, using
    either the estimated or the true opponent beta."""
    possible_actions_opponent = util.possible_moves(state, player='opponent')
    beta = beta_estimation if use_estimation else self.opponent.beta
    total = 0
    for action_op in possible_actions_opponent:
        total += (self.opponent.get_reference(state, action_op, len(possible_actions_opponent))
                  * math.exp(beta * self.values.get(state, {}).get((action, action_op), 0)))
    return math.log(total) / beta
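# Numerical-stability sketch (an assumption, not in the original code):
# the exp-then-log pattern in the three soft backups above can overflow
# for large beta * Q; subtracting the largest exponent first is the
# standard log-sum-exp fix.
def stable_soft_value(weighted_exponents, beta):
    # weighted_exponents: iterable of (reference_weight, beta * Q) pairs
    pairs = list(weighted_exponents)
    m = max(e for _, e in pairs)
    total = sum(w * math.exp(e - m) for w, e in pairs)
    return (m + math.log(total)) / beta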
def get_gradient(self, state, action, Q):
    """Gradient signal for estimating the opponent's beta: the softmax
    expectation of the opponent's Q values minus the observed action's Q."""
    b = self.beta_estimation
    possible_actions = util.possible_moves(state, 'opponent')
    Qs = [Q.get_Q_opponent(state, a) for a in possible_actions]
    Q_action = Q.get_Q_opponent(state, action)
    numerator = sum(q * math.exp(q * b) for q in Qs)
    denominator = sum(math.exp(q * b) for q in Qs)
    return numerator / denominator - Q_action
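# Hedged sketch of how get_gradient could drive the beta estimate: it
# returns E_softmax[Q] - Q(observed action), the negative of the
# log-likelihood gradient with respect to beta, so a descent step
# subtracts it. The method name and learning rate are assumptions.
def update_beta_estimation(self, state, action, Q, learning_rate=0.01):
    self.beta_estimation -= learning_rate * self.get_gradient(state, action, Q)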