from gameplay import play_game
from policies import RandomPolicy, MCTSPolicy
import numpy as np
import networkx as nx

player_policies = [MCTSPolicy(), RandomPolicy()]

# For reproducibility
np.random.seed(0)

games = []
for i in range(100):
    games.append(play_game(player_policies))

graphs = [game[0] for game in games]
dot_graph_combined = nx.compose_all(graphs)
dot_graph = nx.to_pydot(dot_graph_combined)
dot_graph.set_graph_defaults(fontname='Courier')
dot_graph.write_png('multiple_game_graph.png')
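Note: `nx.to_pydot` was removed from NetworkX's top-level namespace in NetworkX 2.0, so the `nx.to_pydot` call above fails on recent releases. A minimal sketch of the same export using the current pydot bridge (it still needs the pydot package and a Graphviz install), reusing the `dot_graph_combined` graph built above:

from networkx.drawing import nx_pydot

# Same export as above, via the non-deprecated pydot bridge.
dot_graph = nx_pydot.to_pydot(dot_graph_combined)
dot_graph.set_graph_defaults(fontname='Courier')
dot_graph.write_png('multiple_game_graph.png')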
Example 2
# Standard-library and numpy imports used by this class. The project-internal
# names referenced below (Player, Board, Draw, UT, RandomPolicy, MCTSPolicy)
# are imported elsewhere in the original source file.
import sys
import time
import turtle
import uuid

import numpy as np


class Game():
    GAME_STATUS = ["Playing", "End", "Draw"]

    def __init__(self, players, turn_id, board):

        self.board = board
        ## to track whose turn it is now
        self.players = players
        self.turn_id = turn_id
        self.status = Game.GAME_STATUS[0]
        self.turn = self.players[self.turn_id]
        self.flag_for_drawing_canvas = False

        # in case a move needs to be made through Random
        self.random_policy = RandomPolicy()
        ## MCTSPolicy(a, b) -- a is the player, b is the opponent

        self.mctsObj_X = MCTSPolicy(self.players[0],
                                    self.players[1],
                                    board=self.board)
        self.mctsObj_O = MCTSPolicy(self.players[1],
                                    self.players[0],
                                    board=self.board)

        self.mctsObjs = [self.mctsObj_X, self.mctsObj_O]
        """
        model_dir = "./analysis-tools/models_ex/"
        model_w_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
        model_json_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
        """
        self.model_based_policy = None  # ModelPolicy(model_dir, model_w_file, model_json_file)
        for each_player in self.players:
            print(each_player.get_policy_mode())
            if each_player.get_policy_mode() == "MODEL":
                model_dir = "./analysis-tools/models_ex/"
                #model_w_file = "model_2020-01-09-17-23-04_BEST_SO_FAR_WITH_Early_Stop-0.730-upto2-0.925weights.h5"
                #model_json_file ="model_2020-01-09-17-23-04_BEST_SO_FAR_WITH_Early_Stop-0.730-upto2-0.925in_json.json"

                # Second best
                #model_w_file =  "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
                #model_json_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
                # very good  Best
                #model_w_file = "model_2020-01-11-11-16-48_win_sample_focus_0.875_weights.h5"
                #model_json_file = "model_2020-01-11-11-16-48_win_sample_focus_0.875_in_json.json"
                # third good
                #model_json_file = "model_2020-01-11-20-11-26_win_sample_focus_0.91_in_json.json"
                #model_w_file = "model_2020-01-11-20-11-26_win_sample_focus_0.91_weights.h5"

                ## LOOKS best so far.. waiting to be done
                model_json_file = "model_2020-01-11-20-39-46_winAndLoss_sample_focus_0.71_in_json.json"
                model_w_file = "model_2020-01-11-20-39-46_winAndLoss_sample_focus_0.71_weights.h5"

                # Done
                model_json_file = "model_2020-01-11-21-07-12_winAndLoss_sample_focus_0.75_in_json.json"
                model_w_file = "model_2020-01-11-21-07-12_winAndLoss_sample_focus_0.75_weights.h5"

                #
                model_json_file = "model_2020-01-12-08-47-16_winAndLoss_sample_focus_0.749_in_json.json"
                model_w_file = "model_2020-01-12-08-47-16_winAndLoss_sample_focus_0.749_weights.h5"

                #model_w_file ="model_2020-01-12-18-59-53_winAndLoss_sample_focus_0.71_weights.h5"
                #model_json_file = "model_2020-01-12-18-59-53_winAndLoss_sample_focus_0.71_in_json.json"

                # Done -- 1K weighted for preventing lose
                #model_w_file = "model_2020-01-12-19-29-34_winAndLoss_Loss1KWeights_sample_focus_0.70_weights.h5"
                #model_json_file = "model_2020-01-12-19-29-34_winAndLoss_Loss1KWeights_sample_focus_0.70_in_json.json"

                # Done
                model_w_file = "model_2020-01-12-21-02-40_winAndLoss_sample_focus_0.733_weights.h5"
                model_json_file = "model_2020-01-12-21-02-40_winAndLoss_sample_focus_0.733_in_json.json"

                # Done -- below are from a buggy weighting scheme
                model_json_file = "model_2020-01-12-21-40-17_winAndLoss_combinedWithUniq_sample_focus_0.649_in_json.json"
                model_w_file = "model_2020-01-12-21-40-17_winAndLoss_combinedWithUniq_sample_focus_0.649_weights.h5"

                # WORST SO FAR
                model_w_file = "model_2020-01-13-07-27-36_winAndLoss_combinedWithUniq_sample_focus_0.41_weights.h5"
                model_json_file = "model_2020-01-13-07-27-36_winAndLoss_combinedWithUniq_sample_focus_0.41_in_json.json"

                # BEST SO FAR
                model_json_file = "model_2020-01-13-21-02-40_winAndLoss_Loss1KWeights_sample_focus_0.718_in_json.json"
                model_w_file = "model_2020-01-13-21-02-40_winAndLoss_Loss1KWeights_sample_focus_0.718_weights.h5"
                # DONE
                model_json_file = "model_2020-01-14-00-19-22_winAndLoss_combinedWithUniq_sample_focus_0.65_in_json.json"
                model_w_file = "model_2020-01-14-00-19-22_winAndLoss_combinedWithUniq_sample_focus_0.65_weights.h5"
                # DONE
                #model_json_file = "model_2020-01-14-21-34-33_winAndLoss_withOneHotEncodeForLabel_sample_focus_0.7108_in_json.json"
                #model_w_file = "model_2020-01-14-21-34-33_winAndLoss_withOneHotEncodeForLabel_sample_focus_0.7108_weights.h5"
                model_obj = each_player.get_model_obj()
                #self.model_based_policy = ModelPolicy(model_obj) #model_dir, model_w_file, model_json_file)
                break
        self.game_id = uuid.uuid1()

    def show_progress_on_canvas(self, a_boolean_flag):
        self.flag_for_drawing_canvas = a_boolean_flag

    def set_to_next_player(self):
        next_turn = (self.turn_id + 1) % len(self.players)
        self.turn_id = next_turn
        self.turn = self.players[self.turn_id]

    def is_end(self):
        if self.is_draw():
            return True

        for each_player in self.players:
            if self.check_end_status(each_player):
                return True
        return False

    def check_end_status(self, a_player):
        if self.board.is_win(a_player):
            return True
        return False

    def get_input(self):
        prompt = "%s 's Turn\n" % (self.turn)
        input_from_user = input(prompt)
        r_c_in_list = input_from_user.split("_")
        r, c = r_c_in_list[0], r_c_in_list[1]
        r_int = ord(r) - ord('a')
        c_int = int(c)
        return r_int, c_int

    def validate_input(self):
        available_pos = self.board.get_available_positions()
        while True:
            r, c = self.get_input()
            if available_pos.get((r, c), 0) == 1:
                break
            print("Try again. Your input")
        return r, c

    # def convert_sequence_moves_to_vector(self):
    #     individual_sequence = [0] * 9
    #     for item in self.board.sequences_of_movements:
    #         turn_for_this_move = item.get("turn")
    #         move_made_for_this_move = item.get("position")
    #         individual_sequence[move_made_for_this_move - 1] = 1 if turn_for_this_move == "X" else 2
    #
    #     return np.array([individual_sequence])

    def play_game(self):
        turn_id = 0
        game_log = {
            'game_uuid': self.get_game_id(),
            'play_modes': {
                'X': self.players[0].get_policy_mode(),
                'O': self.players[1].get_policy_mode()
            },
            'board_size': self.board.row,
            'winner': "",
            'sequence': {}
        }
        canvas_for_drawing = None

        if self.flag_for_drawing_canvas:
            #turtle.setup(500, 500)
            canvas_for_drawing = Draw()
        is_draw_gametie = False
        """
        # Below block must be gone
        # from model_loader import ModelBasedAgent
        # model_dir = "./analysis-tools/models_ex/"
        # model_w_file = model_dir + "current_best.h5" #"model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
        # model_json_file = model_dir + "current_best.json" #model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
        # model_agent_obj = ModelBasedAgent(model_w_file, model_json_file)
        # mlModel = model_agent_obj.get_model()
        """
        while not self.check_end_status(self.turn):
            print(self.board)
            test_instance = self.board.convert_sequence_moves_to_vector()
            #print(test_instance)
            if self.turn.get_player_type() == Player.PTYPE_HUMAN:
                # TODO -- this is just a simplified interface for model-based movement;
                # later it will become part of Policy as a ModelPolicy class.
                # For now we assume player O is the model, since X always moves first.

                #test_instance = np.array([an_instance])

                #prediction_move = mlModel.predict_proba(test_instance)[0]
                #pp = model_agent_obj.predict_proba(test_instance)[0]
                #UT.print_three_arrays(test_instance[0], pp, prediction_move)
                #move_by_prediction = np.argmax(pp) + 1
                #r_e, c_e = self.board.indices_to_coordinate(move_by_prediction)
                #print("R:%d C:%d \t i_e:%d R_e:%d C_e:%d" % (r_v, c_v, move_by_prediction, r_e, c_e))
                r_v, c_v = self.validate_input()

            else:  # when Player is an agent
                if self.turn.get_policy_mode() == "MODEL":
                    model_structure = 3  # 0 for regular, 1 for two tower, 2 for conv2d, 3 for conv2d+twoTowers
                    r_v, c_v = self.turn.model_based_policy.move(
                        self.board, model_structure)
                elif self.turn.get_policy_mode() == "MCTS":
                    if self.turn.get_marker() == "O":
                        r_v, c_v = self.mctsObj_O.move(self.board)
                        # TODO -- this part is just to make a simplified interface of modelbased movement
                        # This could be a place for ModelBased action
                    elif self.turn.get_marker() == "X":
                        # NOTE: this check can never be true here -- the outer
                        # branch already established the policy mode is "MCTS",
                        # so only the else below (mctsObj_X) is ever used.
                        if self.turn.get_policy_mode() == "RANDOM":
                            self.random_policy = RandomPolicy()
                            r_v, c_v = self.random_policy.move(self.board)
                            # print("AM I HERE FOR RANDOM")
                        else:
                            r_v, c_v = self.mctsObj_X.move(self.board)
                elif self.turn.get_policy_mode() == "RANDOM":
                    self.random_policy = RandomPolicy()
                    r_v, c_v = self.random_policy.move(self.board)

            self.board.set_a_move(r_v, c_v, self.turn)
            UT.print_as_log(self.board.get_available_positions())
            ## Drawing on canvas
            if self.flag_for_drawing_canvas:
                canvas_for_drawing.move_and_draw(r_v, c_v,
                                                 self.turn.get_marker())

            if self.check_end_status(self.turn):
                print("FinalResult: %s" % (self.turn.get_marker()))
                print(self.board)
                print(self.board.convert_sequence_moves_to_vector())
                #UT.print_as_log("Winning and so ending this game")
                UT.print_as_log(self.board.sequences_of_movements)
                game_log['winner'] = self.turn.get_marker()
                game_log['sequence'] = self.board.sequences_of_movements

                break

            elif self.is_draw():
                is_draw_gametie = True
                print("FinalResult: Draw")
                #UT.print_as_log("Draw.... so, exiting the game")
                print(self.board)
                print(self.board.convert_sequence_moves_to_vector())
                game_log['winner'] = "D"
                game_log['sequence'] = self.board.sequences_of_movements
                break
            else:
                self.set_to_next_player()

        ## for writing a message to the canvas
        if self.flag_for_drawing_canvas:
            result_message = "Game result -- Winner is %s" % (
                game_log.get("winner"))
            if is_draw_gametie:
                result_message = "Game result :  Draw"
            canvas_for_drawing.write_text(result_message)
            canvas_for_drawing.exit_on_click()

            #canvas_for_drawing.reset_canvas()
            #turtle.TurtleScreen._RUNNING = True
        json_str = game_log  #json.dumps(game_log)
        return json_str

    def a_move_for_agent(self):
        r, c = self.a_move_for_agent_helper()
        return r, c

    ## helper for the agent's move; currently it just picks a random available position
    def a_move_for_agent_helper(self):
        all_available_positions_dict = self.board.get_available_positions()
        random_move_index = np.random.randint(
            0, len(all_available_positions_dict), 1)[0]
        r, c = list(all_available_positions_dict.keys())[random_move_index]
        return r, c

    def is_draw(self):
        if len(self.board.get_available_positions()) < 1:
            return True
        return False

    @staticmethod
    def load_a_game(afile):
        move_sequences = UT.read_a_game(afile)
        if move_sequences:
            Game.parse_history(move_sequences)

    @staticmethod
    def parse_history(adict, message_str=None):

        winner = adict.get("winner", None)
        if winner is None:
            print("Invalid game history: no winner recorded")
            sys.exit(1)
        move_sequences = adict.get("sequence", None)
        turtle.hideturtle()
        board_obj_from_history = Board(3, 3, 3)
        # The object below draws the board on a canvas;
        # comment it out if you do not want the drawing.
        draw_board_obj = Draw()
        for each_move in move_sequences:
            player_marker = each_move.get("turn")
            r_index, c_index = each_move.get("xy")
            p = Player("test", player_marker, 1)
            board_obj_from_history.set_a_move(r_index, c_index, p)
            draw_board_obj.move_and_draw(r_index, c_index, player_marker)
            print(board_obj_from_history)

        draw_board_obj.write_text(
            ("Winner is:  %s -- sampled %s" % (winner, str(message_str))))
        time.sleep(3)
        draw_board_obj.turtle_obj.getpen().clear()
        draw_board_obj.turtle_obj.getscreen().clearscreen()

        # draw_board_obj.exit_on_click()
        # or

    def get_game_id(self):
        return str(self.game_id)
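A hypothetical usage sketch for this class. The Player(name, marker, ptype) and Board(rows, cols, win_length) calls below mirror how those constructors are invoked in `parse_history` above; the meaning of the third Player argument (a player-type flag) is an assumption, as its constants are not shown here.

# Hypothetical driver: two players on a 3x3 board, X moves first.
players = [Player("p1", "X", 1), Player("p2", "O", 1)]
board = Board(3, 3, 3)
game = Game(players, turn_id=0, board=board)
game.show_progress_on_canvas(False)   # skip the turtle canvas
game_log = game.play_game()           # dict with the winner and the move sequence
print(game_log['winner'])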
Example 3
    def __init__(self, players, turn_id, board):

        self.board = board
        ## to track whose turn it is now
        self.players = players
        self.turn_id = turn_id
        self.status = Game.GAME_STATUS[0]
        self.turn = self.players[self.turn_id]
        self.flag_for_drawing_canvas = False

        # in case a move needs to be made through Random
        self.random_policy = RandomPolicy()
        ## MCTSPolicy(a, b) -- a is the player, b is the opponent

        self.mctsObj_X = MCTSPolicy(self.players[0],
                                    self.players[1],
                                    board=self.board)
        self.mctsObj_O = MCTSPolicy(self.players[1],
                                    self.players[0],
                                    board=self.board)

        self.mctsObjs = [self.mctsObj_X, self.mctsObj_O]
        """
        model_dir = "./analysis-tools/models_ex/"
        model_w_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
        model_json_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
        """
        self.model_based_policy = None  # ModelPolicy(model_dir, model_w_file, model_json_file)
        for each_player in self.players:
            print(each_player.get_policy_mode())
            if each_player.get_policy_mode() == "MODEL":
                model_dir = "./analysis-tools/models_ex/"
                #model_w_file = "model_2020-01-09-17-23-04_BEST_SO_FAR_WITH_Early_Stop-0.730-upto2-0.925weights.h5"
                #model_json_file ="model_2020-01-09-17-23-04_BEST_SO_FAR_WITH_Early_Stop-0.730-upto2-0.925in_json.json"

                # Second best
                #model_w_file =  "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
                #model_json_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
                # very good  Best
                #model_w_file = "model_2020-01-11-11-16-48_win_sample_focus_0.875_weights.h5"
                #model_json_file = "model_2020-01-11-11-16-48_win_sample_focus_0.875_in_json.json"
                # third good
                #model_json_file = "model_2020-01-11-20-11-26_win_sample_focus_0.91_in_json.json"
                #model_w_file = "model_2020-01-11-20-11-26_win_sample_focus_0.91_weights.h5"

                ## LOOKS best so far.. waiting to be done
                model_json_file = "model_2020-01-11-20-39-46_winAndLoss_sample_focus_0.71_in_json.json"
                model_w_file = "model_2020-01-11-20-39-46_winAndLoss_sample_focus_0.71_weights.h5"

                # Done
                model_json_file = "model_2020-01-11-21-07-12_winAndLoss_sample_focus_0.75_in_json.json"
                model_w_file = "model_2020-01-11-21-07-12_winAndLoss_sample_focus_0.75_weights.h5"

                #
                model_json_file = "model_2020-01-12-08-47-16_winAndLoss_sample_focus_0.749_in_json.json"
                model_w_file = "model_2020-01-12-08-47-16_winAndLoss_sample_focus_0.749_weights.h5"

                #model_w_file ="model_2020-01-12-18-59-53_winAndLoss_sample_focus_0.71_weights.h5"
                #model_json_file = "model_2020-01-12-18-59-53_winAndLoss_sample_focus_0.71_in_json.json"

                # Done -- 1K weighted for preventing lose
                #model_w_file = "model_2020-01-12-19-29-34_winAndLoss_Loss1KWeights_sample_focus_0.70_weights.h5"
                #model_json_file = "model_2020-01-12-19-29-34_winAndLoss_Loss1KWeights_sample_focus_0.70_in_json.json"

                # Done
                model_w_file = "model_2020-01-12-21-02-40_winAndLoss_sample_focus_0.733_weights.h5"
                model_json_file = "model_2020-01-12-21-02-40_winAndLoss_sample_focus_0.733_in_json.json"

                # Done -- below are from a buggy weighting scheme
                model_json_file = "model_2020-01-12-21-40-17_winAndLoss_combinedWithUniq_sample_focus_0.649_in_json.json"
                model_w_file = "model_2020-01-12-21-40-17_winAndLoss_combinedWithUniq_sample_focus_0.649_weights.h5"

                # WORST SO FAR
                model_w_file = "model_2020-01-13-07-27-36_winAndLoss_combinedWithUniq_sample_focus_0.41_weights.h5"
                model_json_file = "model_2020-01-13-07-27-36_winAndLoss_combinedWithUniq_sample_focus_0.41_in_json.json"

                # BEST SO FAR
                model_json_file = "model_2020-01-13-21-02-40_winAndLoss_Loss1KWeights_sample_focus_0.718_in_json.json"
                model_w_file = "model_2020-01-13-21-02-40_winAndLoss_Loss1KWeights_sample_focus_0.718_weights.h5"
                # DONE
                model_json_file = "model_2020-01-14-00-19-22_winAndLoss_combinedWithUniq_sample_focus_0.65_in_json.json"
                model_w_file = "model_2020-01-14-00-19-22_winAndLoss_combinedWithUniq_sample_focus_0.65_weights.h5"
                # DONE
                #model_json_file = "model_2020-01-14-21-34-33_winAndLoss_withOneHotEncodeForLabel_sample_focus_0.7108_in_json.json"
                #model_w_file = "model_2020-01-14-21-34-33_winAndLoss_withOneHotEncodeForLabel_sample_focus_0.7108_weights.h5"
                model_obj = each_player.get_model_obj()
                #self.model_based_policy = ModelPolicy(model_obj) #model_dir, model_w_file, model_json_file)
                break
        self.game_id = uuid.uuid1()
Example 4
from utils.misc import sample_episode, sample_step
from policies import SimpleBlackjackPolicy, RandomPolicy, create_epsilon_greedy_nchain_policy
import matplotlib.pyplot as plt

# NChainEnv is used below but its import is not part of this snippet; in older
# gym releases it can be imported as follows (the project may define its own):
from gym.envs.toy_text.nchain import NChainEnv

# Default variables

alphas = [0.001]
td_n = 1
actions = [0, 1]
n = 5
env = NChainEnv(n=n, slip=0.0)

# Global settings

target_policy = create_epsilon_greedy_nchain_policy(n, 0.001)
behavior_policy = RandomPolicy(actions)

n_experiments = 10

save_every = 1e3  ### How often we should save the results

# Conf for mc
n_mc_run = int(3e5)
save_every_mc = n_mc_run

# Conf for the mc off policy

n_mc_off_policy = int(3e5)

### Here we create the names
name = "{}Chain".format(n)
Example 5
    def play_game(self):
        turn_id = 0
        game_log = {
            'game_uuid': self.get_game_id(),
            'play_modes': {
                'X': self.players[0].get_policy_mode(),
                'O': self.players[1].get_policy_mode()
            },
            'board_size': self.board.row,
            'winner': "",
            'sequence': {}
        }
        canvas_for_drawing = None

        if self.flag_for_drawing_canvas:
            #turtle.setup(500, 500)
            canvas_for_drawing = Draw()
        is_draw_gametie = False
        """
        # Below block must be gone
        # from model_loader import ModelBasedAgent
        # model_dir = "./analysis-tools/models_ex/"
        # model_w_file = model_dir + "current_best.h5" #"model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
        # model_json_file = model_dir + "current_best.json" #model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
        # model_agent_obj = ModelBasedAgent(model_w_file, model_json_file)
        # mlModel = model_agent_obj.get_model()
        """
        while not self.check_end_status(self.turn):
            print(self.board)
            test_instance = self.board.convert_sequence_moves_to_vector()
            #print(test_instance)
            if self.turn.get_player_type() == Player.PTYPE_HUMAN:
                # TODO -- this is just a simplified interface for model-based movement;
                # later it will become part of Policy as a ModelPolicy class.
                # For now we assume player O is the model, since X always moves first.

                #test_instance = np.array([an_instance])

                #prediction_move = mlModel.predict_proba(test_instance)[0]
                #pp = model_agent_obj.predict_proba(test_instance)[0]
                #UT.print_three_arrays(test_instance[0], pp, prediction_move)
                #move_by_prediction = np.argmax(pp) + 1
                #r_e, c_e = self.board.indices_to_coordinate(move_by_prediction)
                #print("R:%d C:%d \t i_e:%d R_e:%d C_e:%d" % (r_v, c_v, move_by_prediction, r_e, c_e))
                r_v, c_v = self.validate_input()

            else:  # when Player is an agent
                if self.turn.get_policy_mode() == "MODEL":
                    model_structure = 3  # 0 for regular, 1 for two tower, 2 for conv2d, 3 for conv2d+twoTowers
                    r_v, c_v = self.turn.model_based_policy.move(
                        self.board, model_structure)
                elif self.turn.get_policy_mode() == "MCTS":
                    if self.turn.get_marker() == "O":
                        r_v, c_v = self.mctsObj_O.move(self.board)
                        # TODO -- this part is just to make a simplified interface of modelbased movement
                        # This could be a place for ModelBased action
                    elif self.turn.get_marker() == "X":
                        # NOTE: this check can never be true here -- the outer
                        # branch already established the policy mode is "MCTS",
                        # so only the else below (mctsObj_X) is ever used.
                        if self.turn.get_policy_mode() == "RANDOM":
                            self.random_policy = RandomPolicy()
                            r_v, c_v = self.random_policy.move(self.board)
                            # print("AM I HERE FOR RANDOM")
                        else:
                            r_v, c_v = self.mctsObj_X.move(self.board)
                elif self.turn.get_policy_mode() == "RANDOM":
                    self.random_policy = RandomPolicy()
                    r_v, c_v = self.random_policy.move(self.board)

            self.board.set_a_move(r_v, c_v, self.turn)
            UT.print_as_log(self.board.get_available_positions())
            ## Drawing on canvas
            if self.flag_for_drawing_canvas:
                canvas_for_drawing.move_and_draw(r_v, c_v,
                                                 self.turn.get_marker())

            if self.check_end_status(self.turn):
                print("FinalResult: %s" % (self.turn.get_marker()))
                print(self.board)
                print(self.board.convert_sequence_moves_to_vector())
                #UT.print_as_log("Winning and so ending this game")
                UT.print_as_log(self.board.sequences_of_movements)
                game_log['winner'] = self.turn.get_marker()
                game_log['sequence'] = self.board.sequences_of_movements

                break

            elif self.is_draw():
                is_draw_gametie = True
                print("FinalResult: Draw")
                #UT.print_as_log("Draw.... so, exiting the game")
                print(self.board)
                print(self.board.convert_sequence_moves_to_vector())
                game_log['winner'] = "D"
                game_log['sequence'] = self.board.sequences_of_movements
                break
            else:
                self.set_to_next_player()

        ## for writing a message to the canvas
        if self.flag_for_drawing_canvas:
            result_message = "Game result -- Winner is %s" % (
                game_log.get("winner"))
            if is_draw_gametie:
                result_message = "Game result :  Draw"
            canvas_for_drawing.write_text(result_message)
            canvas_for_drawing.exit_on_click()

            #canvas_for_drawing.reset_canvas()
            #turtle.TurtleScreen._RUNNING = True
        json_str = game_log  #json.dumps(game_log)
        return json_str
"""
Plays many games and then plots the cumulative win rates of the players.

The player policies can be chosen from MCTS and Random.
"""

from gameplay import play_game
from policies import RandomPolicy, MCTSPolicy
from visualization import visualize_mcts_tree
import networkx as nx
import numpy as np

# Choose the player policies here:
MCTS_vs_Random = [MCTSPolicy(player='X'), RandomPolicy()]
Random_vs_MCTS = [RandomPolicy(), MCTSPolicy(player='O')]
MCTS_vs_MCTS = [MCTSPolicy(player='X'), MCTSPolicy(player='O')]
Random_vs_Random = [RandomPolicy(), RandomPolicy()]

experiments = [[MCTSPolicy(player='X'), RandomPolicy()],
               [MCTSPolicy(player='X'), RandomPolicy()],
               [RandomPolicy(), MCTSPolicy(player='O')],
               [RandomPolicy(), MCTSPolicy(player='O')],
               [MCTSPolicy(player='X'), MCTSPolicy(player='O')],
               [MCTSPolicy(player='X'), MCTSPolicy(player='O')],
               [RandomPolicy(), RandomPolicy()],
               [RandomPolicy(), RandomPolicy()]]

names = [
    'x_mcts_vs_o_random_1', 'x_mcts_vs_o_random_2', 'x_random_vs_o_mcts_1',
Example 7
def main(args, logdir):
    """
    Model Based Reinforcement Learning
    1) Generate random trajectories
    2) Train the model on the generated data
    3) For each repetition:
        a) Generate new data using the MPC controller
        b) Retrain the model using the new data and the old data
        c) (Optional) Compute Mean Prediction Error
    """

    # SETUP
    train_envs = []
    test_envs = []
    if args.no_sunblaze:
        train_env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)

        if 'PyBullet' in args.env_name and args.render:
            train_env.render()
            train_env.reset()

    elif args.test_type == 'interpolation':
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))

    elif args.test_type == 'extrapolation':
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))

        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name +
                               'RandomExtreme-v0'))
        test_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))

    else:
        train_envs.append(
            sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        test_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name +
                                            '-v0'))

    test_cnt = 0
    for train_env in train_envs:

        assert isinstance(train_env.observation_space, gym.spaces.Box)

        start_time = time.time()
        logger = Logger(logdir)

        is_discrete = isinstance(train_env.action_space, gym.spaces.Discrete)

        ob_dim = train_env.observation_space.shape[0]
        ac_dim = train_env.action_space.n if is_discrete else train_env.action_space.shape[
            0]

        reward_function = get_reward_function(train_env)

        train_env.reset()
        ensemble = Ensemble(ob_dim,
                            ac_dim,
                            is_discrete,
                            args.pnn,
                            args.ensemble_size,
                            args.lr,
                            args.hidden_size,
                            device=nn_utils.DEVICE)

        # TRAIN
        # Instantiate policies
        mpc_policy = MPCPolicy(args, train_env, ensemble, reward_function,
                               nn_utils.DEVICE)
        random_policy = RandomPolicy(train_env)

        # Instantiate Data generator
        data_generator = DataGenerator(args,
                                       train_env,
                                       nn_utils.DEVICE,
                                       mpc_policy,
                                       random_policy,
                                       max_size=args.max_memory_size)

        if args.weights_paths is not None:
            # If weights are given, visualize and quit
            ensemble.load_weights(args.weights_paths)

            current_episodes, rewards, lengths = data_generator.generate_closed_loop_data(
                args.render)
            if args.mpe:
                MPE(train_env,
                    current_episodes,
                    ensemble,
                    args.mpc_horizon,
                    label='Ensemble %s' % (args.weights_paths))
            print('avg reward episode %f' % (np.mean(rewards)))
            print('avg len %f' % (np.mean([len(ep)
                                           for ep in current_episodes])))
            return

        # Otherwise train model on random trajectories
        current_episodes, train_rewards, train_lengths = data_generator.generate_random_data(
        )

        # Train initial model using random trajectories
        train_loss, test_loss = ensemble.train_net(
            args.epochs_rand,
            args.batch_size,
            data_generator,
            samples_per_model=args.samples_per_model)

        if args.mpe:
            print('Computing MPE')
            for (i, model) in enumerate(ensemble.models):
                MPE(train_env,
                    current_episodes,
                    model,
                    args.mpc_horizon,
                    label='random data, model %d' % (i))
            if len(ensemble.models) > 1:
                MPE(train_env,
                    current_episodes,
                    ensemble,
                    args.mpc_horizon,
                    label='random data, ensemble')

        _, eval_rewards, eval_lengths = data_generator.generate_evaluation_data(
            render=args.render)

        # TODO: keep the test data for testing only
        for itr in range(args.repetitions):
            print('\nMPC Repetition %d / %d \n' % (itr + 1, args.repetitions))
            epsilon = mpc_policy.update_epsilon(itr)
            perform_logging(itr, logger, eval_rewards, train_rewards,
                            test_loss, train_loss, eval_lengths, train_lengths,
                            start_time, epsilon)
            current_episodes, train_rewards, train_lengths = data_generator.generate_closed_loop_data(
            )

            train_loss, test_loss = ensemble.train_net(
                args.epochs_rl,
                args.batch_size,
                data_generator,
                samples_per_model=args.samples_per_model)

            if args.mpe:
                print('Computing MPE')
                for (i, model) in enumerate(ensemble.models):
                    MPE(train_env,
                        current_episodes,
                        model,
                        args.mpc_horizon,
                        label='rep %d, model %d' % (itr, i))
                if len(ensemble.models) > 1:
                    MPE(train_env,
                        current_episodes,
                        ensemble,
                        args.mpc_horizon,
                        label='rep %d, ensemble' % (itr))

            _, eval_rewards, eval_lengths = data_generator.generate_evaluation_data(
                render=args.render)

            if args.save_model:
                for (i, model) in enumerate(ensemble.models):
                    save_file = '%s/models/rep_%d_model_%d_%.4f.pt' % (
                        str(logdir), itr, i, test_loss[i][-1])
                    torch.save(model.state_dict(), save_file)

        # SUNBLAZE TEST
        for test_env in test_envs:
            test_name = test_env.unwrapped.spec.id
            train_name = train_env.unwrapped.spec.id
            if test_cnt < 3:
                print('\nTESTING: ' + train_name + ' on ' + test_name,
                      flush=True)
                success_function = get_success_function(test_env)
                num_success = 0
                rewards = []
                for ep_num in range(args.test_episodes):
                    success, ep_reward = run_test_episode(
                        test_env, mpc_policy, success_function, args.render)
                    rewards.append(ep_reward)
                    num_success += int(success)
                    print(
                        'Test episode: %2d / %2d \t Success: %d \t Reward: %d'
                        % (ep_num + 1, args.test_episodes, int(success),
                           ep_reward),
                        flush=True)

                score = num_success / args.test_episodes * 100
                logger.log_scalar(score, test_name + '-' + train_name, 0)
                with open(train_name + '_' + test_name + '_score.txt',
                          'w+') as f:
                    f.write('Score for ' + train_name + ' tested on ' +
                            test_name + ': ' + str(score))

                print('\nScore for ' + train_name + ' tested on ' + test_name +
                      ' testing: ',
                      score,
                      flush=True)
                test_cnt += 1
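For reference, a hypothetical argparse setup covering the `args` attributes this function reads. The flag names mirror the attributes used above; the types and defaults are guesses, not the project's actual CLI.

import argparse

def build_arg_parser():
    # Hypothetical CLI: one flag per attribute accessed on `args` above.
    p = argparse.ArgumentParser()
    p.add_argument('--env_name', type=str, default='HalfCheetah')
    p.add_argument('--no_sunblaze', action='store_true')
    p.add_argument('--test_type', type=str, default='interpolation')  # 'interpolation', 'extrapolation', or other
    p.add_argument('--render', action='store_true')
    p.add_argument('--pnn', action='store_true')
    p.add_argument('--ensemble_size', type=int, default=5)
    p.add_argument('--lr', type=float, default=1e-3)
    p.add_argument('--hidden_size', type=int, default=256)
    p.add_argument('--max_memory_size', type=int, default=100000)
    p.add_argument('--weights_paths', nargs='+', default=None)
    p.add_argument('--mpe', action='store_true')  # compute Mean Prediction Error
    p.add_argument('--mpc_horizon', type=int, default=20)
    p.add_argument('--epochs_rand', type=int, default=50)
    p.add_argument('--epochs_rl', type=int, default=20)
    p.add_argument('--batch_size', type=int, default=256)
    p.add_argument('--samples_per_model', type=int, default=1000)
    p.add_argument('--repetitions', type=int, default=10)
    p.add_argument('--save_model', action='store_true')
    p.add_argument('--test_episodes', type=int, default=10)
    return p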
Example 8
"""
Generates sample figures visualizing game trees.

First, it generates one game graph and saves it to 'game_graph.png'

Then, it plays multiple games, and composes their graphs and saves it to
'multiple_game_graph.png'

Requires the NetworkX graph package and GraphViz, which are included in
Anaconda
"""
from gameplay import play_game
from policies import RandomPolicy
import networkx as nx

player_policies = [RandomPolicy(), RandomPolicy()]
G = play_game(player_policies)

dot_graph = nx.to_pydot(G)
dot_graph.set_graph_defaults(fontname='Courier')
dot_graph.write_png('game_graph.png')

games = []

for i in range(30):
    games.append(play_game(player_policies))

dot_graph_combined = nx.compose_all(games)
dot_graph = nx.to_pydot(dot_graph_combined)
dot_graph.set_graph_defaults(fontname='Courier')
dot_graph.write_png('multiple_game_graph.png')
Example 9
import argparse
import logging

import gym
import numpy as np
from tqdm import trange

from policies import RandomPolicy

_ENVS = ['CartPole-v1', 'MountainCar-v0', 'Acrobot-v1', 'MountainCarContinuous-v0', 'Pendulum-v0']

if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', '-e', type=str, default='CartPole-v1', choices=_ENVS)
    args = parser.parse_args()

    logging.info('Making env {}'.format(args.env_name))
    env = gym.make(args.env_name)
    pi = RandomPolicy(env.observation_space, env.action_space)

    all_obs = []
    for _ in trange(100):
        obs = env.reset()
        all_obs.append(obs)
        while True:
            obs, _, done, _ = env.step(pi(obs))
            all_obs.append(obs)

            if done: break

    std_obs = np.std(all_obs, axis=0)
    print(repr(std_obs))
    import IPython; IPython.embed(); exit(0)
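The per-dimension standard deviations printed above are the kind of statistics typically used to normalize observations before training a model; a small sketch of that use, assuming a mean computed the same way as `std_obs`:

obs_mean = np.mean(all_obs, axis=0)

def normalize_obs(obs, mean=obs_mean, std=std_obs, eps=1e-8):
    # Scale each observation dimension to roughly zero mean / unit variance;
    # eps guards against division by zero for constant dimensions.
    return (obs - mean) / (std + eps)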
    
Example 10
def set_policies(policies_name, user_segment, user_features, n_playlists):
    # Please see section 3.3 of RecSys paper for a description of policies
    POLICIES_SETTINGS = {
        'random':
        RandomPolicy(n_playlists),
        'etc-seg-explore':
        ExploreThenCommitSegmentPolicy(user_segment,
                                       n_playlists,
                                       min_n=100,
                                       cascade_model=True),
        'etc-seg-exploit':
        ExploreThenCommitSegmentPolicy(user_segment,
                                       n_playlists,
                                       min_n=20,
                                       cascade_model=True),
        'epsilon-greedy-explore':
        EpsilonGreedySegmentPolicy(user_segment,
                                   n_playlists,
                                   epsilon=0.1,
                                   cascade_model=True),
        'epsilon-greedy-exploit':
        EpsilonGreedySegmentPolicy(user_segment,
                                   n_playlists,
                                   epsilon=0.01,
                                   cascade_model=True),
        'kl-ucb-seg':
        KLUCBSegmentPolicy(user_segment, n_playlists, cascade_model=True),
        'ts-seg-naive':
        TSSegmentPolicy(user_segment,
                        n_playlists,
                        alpha_zero=1,
                        beta_zero=1,
                        cascade_model=True),
        'ts-seg-pessimistic':
        TSSegmentPolicy(user_segment,
                        n_playlists,
                        alpha_zero=1,
                        beta_zero=99,
                        cascade_model=True),
        'ts-lin-naive':
        LinearTSPolicy(user_features,
                       n_playlists,
                       bias=0.0,
                       cascade_model=True),
        'ts-lin-pessimistic':
        LinearTSPolicy(user_features,
                       n_playlists,
                       bias=-5.0,
                       cascade_model=True),
        # Versions of epsilon-greedy-explore and ts-seg-pessimistic WITHOUT cascade model
        'epsilon-greedy-explore-no-cascade':
        EpsilonGreedySegmentPolicy(user_segment,
                                   n_playlists,
                                   epsilon=0.1,
                                   cascade_model=False),
        'ts-seg-pessimistic-no-cascade':
        TSSegmentPolicy(user_segment,
                        n_playlists,
                        alpha_zero=1,
                        beta_zero=99,
                        cascade_model=False)
    }

    return [POLICIES_SETTINGS[name] for name in policies_name]
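A minimal usage sketch for this helper; `user_segment`, `user_features`, and `n_playlists` stand for data loaded elsewhere in the original project.

# Hypothetical call: build two of the policies described above by name.
policy_names = ['random', 'ts-seg-pessimistic']
policies = set_policies(policy_names, user_segment, user_features, n_playlists)
for policy_name, policy in zip(policy_names, policies):
    print(policy_name, type(policy).__name__)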