def model_environment(opponent, state, action):
    # Apply the agent's move, then (if the game is not over) the opponent's
    # reply, and return the resulting board, the reward, and a terminal flag.
    game_complete = False
    initial_board = state

    file.write("AGENT MAKING MOVE: " + str(action) + str(board.to_state(action)) + "\n")
    current_board = p.add_move('X', action, initial_board)
    print("AFTER AGENT MOVE:")
    print(game.to_display_string(current_board))
    file.write("AFTER AGENT MOVE:\n")
    file.write(game.to_display_string(current_board))

    reward = 0.0
    if p.is_winner(current_board, 'X'):
        game_complete = True
        reward = 1.0
    elif p.is_cat_game(current_board):
        game_complete = True
        reward = 0.0

    if not game_complete:
        # let the opponent make a move ...
        (opponent_id, opponent_move) = opponent.pick_next_move(current_board)
        current_board = p.add_move(opponent_id, opponent_move, current_board)
        print("AFTER OPPONENT MOVE:")
        print(game.to_display_string(current_board))
        file.write("AFTER OPPONENT MOVE:\n")
        file.write(game.to_display_string(current_board))

        if p.is_winner(current_board, opponent_id):
            game_complete = True
            reward = -1.0
        elif p.is_cat_game(current_board):
            game_complete = True
            reward = 0.0

    return current_board, reward, game_complete
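A single call to this environment model therefore covers both the agent's move and the opponent's reply in one transition. The snippet below is only a sketch of the calling convention, not code from the project: random_opponent is an assumed opponent object exposing pick_next_move, and file must already be an open module-level log, since model_environment writes to it.

# Illustration only: one transition from an empty board.
# random_opponent is an assumed opponent object; `file` must already be open.
state = p.empty_board()
next_state, reward, done = model_environment(random_opponent, state, action=4)
# reward is +1.0 if 'X' has just won, -1.0 if the opponent has won, 0.0 otherwise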
def play_game(p1, p2, file=None):
    # Play a full game between two player objects, alternating moves until
    # someone wins or the board fills up (a cat game). Returns the winning
    # player, or None for a draw.
    board = p.empty_board()
    players = [p1, p2]
    current_player_index = 0
    winner = None
    move_count = 0

    while True:
        print("Current move is for player: ", players[current_player_index].player)
        if file is not None:
            file.write("PRIOR TO MOVE " + str(move_count) + " ------------\n")
            file.write(to_display_string(board))

        if p.is_cat_game(board):
            if file is not None:
                file.write("RESULT IS CAT GAME\n")
            break

        m = players[current_player_index].pick_next_move(board)
        board = p.add_move(m[0], m[1], board)
        p.display_board(board)
        move_count += 1

        if p.is_winner(board, players[current_player_index].player):
            winner = players[current_player_index]
            if file is not None:
                file.write("FINAL BOARD AFTER MOVE " + str(move_count) +
                           " WINNER IS: " + winner.player + "\n")
                file.write(to_display_string(board))
            break

        # alternate players
        if current_player_index == 0:
            print("Switching to player 1...")
            current_player_index = 1
        else:
            print("Switching to player 0...")
            current_player_index = 0

    if winner is None:
        print("CAT GAME")
    else:
        print("WINNER IS PLAYER: ", winner.player)
    return winner
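For reference, a game might be kicked off as below. Agent and RandomPlayer are placeholder names for whatever player classes this project defines; anything with a player attribute and a pick_next_move(board) method returning a (player_id, cell) pair will work here.

# Hypothetical usage; Agent and RandomPlayer stand in for the project's player classes.
with open("game_log.txt", "w") as log:
    winner = play_game(Agent('X'), RandomPlayer('O'), file=log)
    if winner is not None:
        print("Winner:", winner.player)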
def find_actions_for_state(self, board):
    # Given a board, find the available / unoccupied cells; these define the
    # next possible moves. Each action pairs the chosen cell with the board
    # that results from playing it.
    p.display_board(board)
    available_cells = p.get_available_cells(board)
    print("Found ", len(available_cells), " available cells --> ", available_cells)

    actions = []
    for cell in available_cells:
        bnext = p.add_move(self.player, cell, board)
        actions.append({'cell': cell, 'board': bnext})
    return actions
def update_q_value(self, board, action, reward):
    self.log.write("UPDATING Q VALUE FOR BOARD ------\n")
    self.log.write(self.to_string(board))
    self.log.write("GIVEN ACTION: " + str(action['cell']) + "\n")

    previous_value = self.get_value_for_state_and_action(board, action['cell'])
    self.log.write("PREVIOUS VALUE FOR STATE/ACTION: " + str(previous_value) + "\n")
    self.log.flush()

    # Approximate the FUTURE reward from the next state: max_a Q(s', a).
    # First advance from s to s' given action a and reward r, then take the
    # max over all actions from s'. The other player's reply has to be
    # approximated here, otherwise the board is out of sync with game play.
    next_board = p.copy(action['board'])

    # Model the opponent's reply -- for now, just assume a random move.
    opponent_possible_cells = p.get_available_cells(next_board)
    if len(opponent_possible_cells) > 0:
        self.log.write("POSSIBLE OPPONENT NEXT MOVES: " + str(opponent_possible_cells) + "\n")
        inx_select = np.random.randint(0, len(opponent_possible_cells))
        opponent_move = opponent_possible_cells[inx_select]
        self.log.write("ASSUME OPPONENT NEXT MOVE: " + str(opponent_move) + "\n")

        next_board_seen = p.add_move(p.get_other_player(self.player), opponent_move, next_board)
        self.log.write("ASSUMED NEXT BOARD SEEN ----\n")
        self.log.write(self.to_string(next_board_seen))
        self.log.flush()

        next_actions = self.find_actions_for_state(next_board_seen)
        if len(next_actions) > 0:
            next_q_values = self.find_q_values(next_actions)
            # Use a distinct loop variable so the `action` argument is not
            # clobbered before the final update below.
            for i in range(len(next_actions)):
                next_action = next_actions[i]
                self.log.write("Q VALUE FOR ACTION " + str(i) + " --> " +
                               str(next_action['cell']) + " : " + str(next_q_values[i]) + "\n")
            print("Got q values for actions: ", next_q_values)
            max_next_q = max(next_q_values)
        else:
            max_next_q = 0.0
    else:
        print("EMPTY POSSIBLE OPPONENT NEXT CELLS")
        max_next_q = 0.0

    # Q-learning update: Q(s,a) += learning_rate * (r + gamma * max_a Q(s',a) - Q(s,a))
    next_value = previous_value + self.learning_rate * (reward + self.gamma * max_next_q - previous_value)
    print("Updated q value ==> ", next_value)
    self.add_or_update_value(board, next_value)
    self.update_q_value_for_state_and_action(board, action['cell'], next_value)
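The last two statements implement the standard tabular Q-learning update, Q(s,a) <- Q(s,a) + learning_rate * (reward + gamma * max_next_q - Q(s,a)), with the twist that the next state s' is estimated by assuming a random opponent reply. The sketch below shows one way the pieces in this section could fit together during training; it is an illustration under stated assumptions, not the project's actual training loop. Here agent is assumed to expose the methods above plus an epsilon attribute, random_opponent exposes pick_next_move, and file is the open log that model_environment expects.

import numpy as np

# One hypothetical training episode built from the functions above.
state = p.empty_board()
done = False
while not done:
    actions = agent.find_actions_for_state(state)       # candidate moves for the agent
    q_values = agent.find_q_values(actions)              # current Q estimates for each move
    if np.random.rand() < agent.epsilon:                 # epsilon-greedy exploration (assumed attribute)
        chosen = actions[np.random.randint(len(actions))]
    else:
        chosen = actions[int(np.argmax(q_values))]
    # Advance the environment: agent move plus the simulated opponent reply.
    next_state, reward, done = model_environment(random_opponent, state, chosen['cell'])
    agent.update_q_value(state, chosen, reward)           # tabular Q-learning update
    state = next_state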