from gameplay import play_game
from policies import RandomPolicy, MCTSPolicy
import numpy as np
import networkx as nx

player_policies = [MCTSPolicy(), RandomPolicy()]

# For reproducibility
np.random.seed(0)

games = []
for i in range(100):
    games.append(play_game(player_policies))

graphs = [game[0] for game in games]
dot_graph_combined = nx.compose_all(graphs)
# NOTE: nx.to_pydot was removed from the top-level namespace in NetworkX >= 2.0;
# there it is available as nx.nx_pydot.to_pydot.
dot_graph = nx.to_pydot(dot_graph_combined)
dot_graph.set_graph_defaults(fontname='Courier')
dot_graph.write_png('multiple_game_graph.png')
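# Hedged aside (not part of the original script): a minimal, self-contained
# illustration of what nx.compose_all does with the per-game graphs above --
# the graphs are merged into one, and nodes that share the same identifier
# (here a hypothetical 'root' node) are unified rather than duplicated.
import networkx as nx

g1 = nx.DiGraph([('root', 'a')])
g2 = nx.DiGraph([('root', 'b')])
combined = nx.compose_all([g1, g2])
print(sorted(combined.edges()))  # [('root', 'a'), ('root', 'b')]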
# Standard-library and numpy imports used by this class (added for completeness;
# the project-local Player, Board, Draw, UT, RandomPolicy, MCTSPolicy, and
# ModelPolicy names are assumed to be imported elsewhere in the original module).
import sys
import time
import turtle
import uuid

import numpy as np


class Game:

    GAME_STATUS = ["Playing", "End", "Draw"]

    def __init__(self, players, turn_id, board):
        self.board = board
        ## to keep track of whose turn it is now
        self.players = players
        self.turn_id = turn_id
        self.status = Game.GAME_STATUS[0]
        self.turn = self.players[self.turn_id]
        self.flag_for_drawing_canvas = False

        # in case a move needs to be made through Random
        self.random_policy = RandomPolicy()

        ## MCTSPolicy(a, b) -- a is the player, b is the opponent
        self.mctsObj_X = MCTSPolicy(self.players[0], self.players[1], board=self.board)
        self.mctsObj_O = MCTSPolicy(self.players[1], self.players[0], board=self.board)
        self.mctsObjs = [self.mctsObj_X, self.mctsObj_O]

        """
        model_dir = "./analysis-tools/models_ex/"
        model_w_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
        model_json_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
        """
        self.model_based_policy = None  # ModelPolicy(model_dir, model_w_file, model_json_file)

        for each_player in self.players:
            print(each_player.get_policy_mode())
            if each_player.get_policy_mode() == "MODEL":
                model_dir = "./analysis-tools/models_ex/"
                #model_w_file = "model_2020-01-09-17-23-04_BEST_SO_FAR_WITH_Early_Stop-0.730-upto2-0.925weights.h5"
                #model_json_file = "model_2020-01-09-17-23-04_BEST_SO_FAR_WITH_Early_Stop-0.730-upto2-0.925in_json.json"

                # Second best
                #model_w_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
                #model_json_file = "model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"

                # very good Best
                #model_w_file = "model_2020-01-11-11-16-48_win_sample_focus_0.875_weights.h5"
                #model_json_file = "model_2020-01-11-11-16-48_win_sample_focus_0.875_in_json.json"

                # third good
                #model_json_file = "model_2020-01-11-20-11-26_win_sample_focus_0.91_in_json.json"
                #model_w_file = "model_2020-01-11-20-11-26_win_sample_focus_0.91_weights.h5"

                ## LOOKS best so far.. waiting to be done
                model_json_file = "model_2020-01-11-20-39-46_winAndLoss_sample_focus_0.71_in_json.json"
                model_w_file = "model_2020-01-11-20-39-46_winAndLoss_sample_focus_0.71_weights.h5"

                # Done
                model_json_file = "model_2020-01-11-21-07-12_winAndLoss_sample_focus_0.75_in_json.json"
                model_w_file = "model_2020-01-11-21-07-12_winAndLoss_sample_focus_0.75_weights.h5"

                #
                model_json_file = "model_2020-01-12-08-47-16_winAndLoss_sample_focus_0.749_in_json.json"
                model_w_file = "model_2020-01-12-08-47-16_winAndLoss_sample_focus_0.749_weights.h5"

                #model_w_file = "model_2020-01-12-18-59-53_winAndLoss_sample_focus_0.71_weights.h5"
                #model_json_file = "model_2020-01-12-18-59-53_winAndLoss_sample_focus_0.71_in_json.json"

                # Done -- 1K weight for preventing a loss
                #model_w_file = "model_2020-01-12-19-29-34_winAndLoss_Loss1KWeights_sample_focus_0.70_weights.h5"
                #model_json_file = "model_2020-01-12-19-29-34_winAndLoss_Loss1KWeights_sample_focus_0.70_in_json.json"

                # Done
                model_w_file = "model_2020-01-12-21-02-40_winAndLoss_sample_focus_0.733_weights.h5"
                model_json_file = "model_2020-01-12-21-02-40_winAndLoss_sample_focus_0.733_in_json.json"

                # Done -- below are from a buggy weighting scheme
                model_json_file = "model_2020-01-12-21-40-17_winAndLoss_combinedWithUniq_sample_focus_0.649_in_json.json"
                model_w_file = "model_2020-01-12-21-40-17_winAndLoss_combinedWithUniq_sample_focus_0.649_weights.h5"

                # WORST SO FAR
                model_w_file = "model_2020-01-13-07-27-36_winAndLoss_combinedWithUniq_sample_focus_0.41_weights.h5"
                model_json_file = "model_2020-01-13-07-27-36_winAndLoss_combinedWithUniq_sample_focus_0.41_in_json.json"

                # BEST SO FAR
                model_json_file = "model_2020-01-13-21-02-40_winAndLoss_Loss1KWeights_sample_focus_0.718_in_json.json"
                model_w_file = "model_2020-01-13-21-02-40_winAndLoss_Loss1KWeights_sample_focus_0.718_weights.h5"

                # Done
                model_json_file = "model_2020-01-14-00-19-22_winAndLoss_combinedWithUniq_sample_focus_0.65_in_json.json"
                model_w_file = "model_2020-01-14-00-19-22_winAndLoss_combinedWithUniq_sample_focus_0.65_weights.h5"

                # Done
                #model_json_file = "model_2020-01-14-21-34-33_winAndLoss_withOneHotEncodeForLabel_sample_focus_0.7108_in_json.json"
                #model_w_file = "model_2020-01-14-21-34-33_winAndLoss_withOneHotEncodeForLabel_sample_focus_0.7108_weights.h5"

                model_obj = each_player.get_model_obj()
                #self.model_based_policy = ModelPolicy(model_obj)  # model_dir, model_w_file, model_json_file
                break

        self.game_id = uuid.uuid1()

    def show_progress_on_canvas(self, a_boolean_flag):
        self.flag_for_drawing_canvas = a_boolean_flag

    def set_to_next_player(self):
        next_turn = (self.turn_id + 1) % len(self.players)
        self.turn_id = next_turn
        self.turn = self.players[self.turn_id]

    def is_end(self):
        if self.is_draw():
            return True
        for each_player in self.players:
            if self.check_end_status(each_player):
                return True
        return False

    def check_end_status(self, a_player):
        if self.board.is_win(a_player):
            return True
        return False

    def get_input(self):
        prompt = "%s's Turn\n" % (self.turn)
        input_from_user = input(prompt)
        r_c_in_list = input_from_user.split("_")
        r, c = r_c_in_list[0], r_c_in_list[1]
        r_int = ord(r) - ord('a')
        c_int = int(c)
        return r_int, c_int

    def validate_input(self):
        available_pos = self.board.get_available_positions()
        while True:
            r, c = self.get_input()
            if available_pos.get((r, c), 0) == 1:
                break
            print("Try again. Your input is not an available position.")
        return r, c

    # def convert_sequence_moves_to_vector(self):
    #     individual_sequence = [0] * 9
    #     for item in self.board.sequences_of_movements:
    #         turn_for_this_move = item.get("turn")
    #         move_made_for_this_move = item.get("position")
    #         individual_sequence[move_made_for_this_move - 1] = 1 if turn_for_this_move == "X" else 2
    #
    #     return np.array([individual_sequence])

    def play_game(self):
        turn_id = 0
        game_log = {
            'game_uuid': self.get_game_id(),
            'play_modes': {
                'X': self.players[0].get_policy_mode(),
                'O': self.players[1].get_policy_mode()
            },
            'board_size': self.board.row,
            'winner': "",
            'sequence': {}
        }

        canvas_for_drawing = None
        if self.flag_for_drawing_canvas:
            #turtle.setup(500, 500)
            canvas_for_drawing = Draw()

        is_draw_gametie = False

        """
        # Below block must be gone
        # from model_loader import ModelBasedAgent
        # model_dir = "./analysis-tools/models_ex/"
        # model_w_file = model_dir + "current_best.h5"  #"model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924weights.h5"
        # model_json_file = model_dir + "current_best.json"  #"model_2020-01-09-15-15-06_BEST_SO_FAR_WITH_Early_Stop-0.90-upto2-0.924in_json.json"
        # model_agent_obj = ModelBasedAgent(model_w_file, model_json_file)
        # mlModel = model_agent_obj.get_model()
        """

        while not self.check_end_status(self.turn):
            print(self.board)
            test_instance = self.board.convert_sequence_moves_to_vector()
            #print(test_instance)

            if self.turn.get_player_type() == Player.PTYPE_HUMAN:
                # TODO -- this part is just to make a simplified interface of model-based movement.
                # Later, this will be part of Policy as a ModelPolicy class.
                # For now, we assume player O would be the model, as X always starts first.
                #test_instance = np.array([an_instance])
                #prediction_move = mlModel.predict_proba(test_instance)[0]
                #pp = model_agent_obj.predict_proba(test_instance)[0]
                #UT.print_three_arrays(test_instance[0], pp, prediction_move)
                #move_by_prediction = np.argmax(pp) + 1
                #r_e, c_e = self.board.indices_to_coordinate(move_by_prediction)
                #print("R:%d C:%d \t i_e:%d R_e:%d C_e:%d" % (r_v, c_v, move_by_prediction, r_e, c_e))
                r_v, c_v = self.validate_input()
            else:
                # when the Player is an agent
                if self.turn.get_policy_mode() == "MODEL":
                    model_structure = 3  # 0 for regular, 1 for two tower, 2 for conv2d, 3 for conv2d+twoTowers
                    r_v, c_v = self.turn.model_based_policy.move(self.board, model_structure)
                elif self.turn.get_policy_mode() == "MCTS":
                    if self.turn.get_marker() == "O":
                        r_v, c_v = self.mctsObj_O.move(self.board)
                        # TODO -- this part is just to make a simplified interface of model-based movement.
                        # This could be a place for a ModelBased action.
                    elif self.turn.get_marker() == "X":
                        if self.turn.get_policy_mode() == "RANDOM":
                            self.random_policy = RandomPolicy()
                            r_v, c_v = self.random_policy.move(self.board)
                            # print("AM I HERE FOR RANDOM")
                        else:
                            r_v, c_v = self.mctsObj_X.move(self.board)
                elif self.turn.get_policy_mode() == "RANDOM":
                    self.random_policy = RandomPolicy()
                    r_v, c_v = self.random_policy.move(self.board)

            self.board.set_a_move(r_v, c_v, self.turn)
            UT.print_as_log(self.board.get_available_positions())

            ## Drawing on canvas
            if self.flag_for_drawing_canvas:
                canvas_for_drawing.move_and_draw(r_v, c_v, self.turn.get_marker())

            if self.check_end_status(self.turn):
                print("FinalResult: %s" % (self.turn.get_marker()))
                print(self.board)
                print(self.board.convert_sequence_moves_to_vector())
                #UT.print_as_log("Winning and so ending this game")
                UT.print_as_log(self.board.sequences_of_movements)
                game_log['winner'] = self.turn.get_marker()
                game_log['sequence'] = self.board.sequences_of_movements
                break
            elif self.is_draw():
                is_draw_gametie = True
                print("FinalResult: Draw")
                #UT.print_as_log("Draw.... so, exiting the game")
                print(self.board)
                print(self.board.convert_sequence_moves_to_vector())
                game_log['winner'] = "D"
                game_log['sequence'] = self.board.sequences_of_movements
                break
            else:
                self.set_to_next_player()

        ## for writing a message to the canvas
        if self.flag_for_drawing_canvas:
            result_message = "Game result -- Winner is %s" % (game_log.get("winner"))
            if is_draw_gametie:
                result_message = "Game result : Draw"
            canvas_for_drawing.write_text(result_message)
            canvas_for_drawing.exit_on_click()
            #canvas_for_drawing.reset_canvas()
            #turtle.TurtleScreen._RUNNING = True

        json_str = game_log  #json.dumps(game_log)
        return json_str

    def a_move_for_agent(self):
        r, c = self.a_move_for_agent_helper()
        return r, c

    ## this is the function for an agent to come up with a smarter decision
    def a_move_for_agent_helper(self):
        all_available_positions_dict = self.board.get_available_positions()
        random_move_index = np.random.randint(0, len(all_available_positions_dict), 1)[0]
        r, c = list(all_available_positions_dict.keys())[random_move_index]
        return r, c

    def is_draw(self):
        if len(self.board.get_available_positions()) < 1:
            return True
        return False

    @staticmethod
    def load_a_game(afile):
        move_sequences = UT.read_a_game(afile)
        if move_sequences:
            Game.parse_history(move_sequences)

    @staticmethod
    def parse_history(adict, message_str=None):
        winner = adict.get("winner", None)
        if winner is None:
            print("Something is wrong")
            sys.exit(1)
        move_sequences = adict.get("sequence", None)

        turtle.hideturtle()
        board_obj_from_history = Board(3, 3, 3)
        # the object below is for drawing the board on a canvas;
        # if you don't want it, comment it out
        draw_board_obj = Draw()
        for each_move in move_sequences:
            player_marker = each_move.get("turn")
            r_index, c_index = each_move.get("xy")
            p = Player("test", player_marker, 1)
            board_obj_from_history.set_a_move(r_index, c_index, p)
            draw_board_obj.move_and_draw(r_index, c_index, player_marker)
            print(board_obj_from_history)

        draw_board_obj.write_text(("Winner is: %s -- sampled %s" % (winner, str(message_str))))
        time.sleep(3)
        draw_board_obj.turtle_obj.getpen().clear()
        draw_board_obj.turtle_obj.getscreen().clearscreen()
        # draw_board_obj.exit_on_click()  # or

    def get_game_id(self):
        return str(self.game_id)
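# Hedged usage sketch (not from the original file): how the Game class above
# might be driven. Board(3, 3, 3) and the Player(name, marker, ...) call are
# taken from parse_history above; the exact meaning of the third Player
# argument and the policy-mode wiring are assumptions, not confirmed API.
if __name__ == "__main__":
    board = Board(3, 3, 3)
    players = [Player("p1", "X", 1), Player("p2", "O", 1)]   # hypothetical setup
    game = Game(players, turn_id=0, board=board)
    game.show_progress_on_canvas(False)
    game_log = game.play_game()          # returns a dict with 'winner' and 'sequence'
    print(game_log.get("winner"))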
from utils.misc import sample_episode, sample_step
from policies import SimpleBlackjackPolicy, RandomPolicy, create_epsilon_greedy_nchain_policy
import matplotlib.pyplot as plt

# Default variables
alphas = [0.001]
td_n = 1
actions = [0, 1]
n = 5
# NOTE: NChainEnv is assumed to be imported elsewhere in the original module
# (its import is not shown in this excerpt).
env = NChainEnv(n=n, slip=0.0)

# Global settings
target_policy = create_epsilon_greedy_nchain_policy(n, 0.001)
behavior_policy = RandomPolicy(actions)
n_experiments = 10
save_every = 1e3  ### How often we should save the results

# Conf for MC
n_mc_run = int(3e5)
save_every_mc = n_mc_run

# Conf for the MC off-policy runs
n_mc_off_policy = int(3e5)

### Here we create the names
name = "{}Chain".format(n)
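# Hedged aside (not the project's create_epsilon_greedy_nchain_policy, whose
# implementation is not shown here): a generic sketch of how an epsilon-greedy
# target policy over the two NChain actions could be built from a Q-table.
import numpy as np


def make_epsilon_greedy_policy(Q, epsilon, n_actions):
    """Map a state index to epsilon-greedy action probabilities w.r.t. a Q-table."""
    def policy(state):
        probs = np.full(n_actions, epsilon / n_actions)
        probs[int(np.argmax(Q[state]))] += 1.0 - epsilon
        return probs
    return policy


# Example: an (n_states x n_actions) Q-table for the 5-state chain above.
Q_example = np.zeros((5, 2))
pi = make_epsilon_greedy_policy(Q_example, epsilon=0.001, n_actions=2)
print(pi(0))  # -> [0.9995, 0.0005]; argmax ties resolve to action 0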
""" Plays many games and then plots the cumulative win rates of the players. The player policies can be chosen from MCTS and Random. """ from gameplay import play_game from policies import RandomPolicy, MCTSPolicy from visualization import visualize_mcts_tree import networkx as nx import numpy as np # Choose the player policies here: MCTS_vs_Random = [MCTSPolicy(player='X'), RandomPolicy()] Random_vs_MCTS = [RandomPolicy(), MCTSPolicy(player='O')] MCTS_vs_MCTS = [MCTSPolicy(player='X'), MCTSPolicy(player='O')] Random_vs_Random = [RandomPolicy(), RandomPolicy()] experiments = [[MCTSPolicy(player='X'), RandomPolicy()], [MCTSPolicy(player='X'), RandomPolicy()], [RandomPolicy(), MCTSPolicy(player='O')], [RandomPolicy(), MCTSPolicy(player='O')], [MCTSPolicy(player='X'), MCTSPolicy(player='O')], [MCTSPolicy(player='X'), MCTSPolicy(player='O')], [RandomPolicy(), RandomPolicy()], [RandomPolicy(), RandomPolicy()]] names = [ 'x_mcts_vs_o_random_1', 'x_mcts_vs_o_random_2', 'x_random_vs_o_mcts_1',
def main(args, logdir):
    """
    Model Based Reinforcement Learning
    1) Generate random trajectories
    2) Train the model on the generated data
    3) For each repetition:
        a) Generate new data using the MPC controller
        b) Retrain the model using the new data and the old data
        c) (Optional) Compute Mean Prediction Error
    """
    # SETUP
    train_envs = []
    test_envs = []
    if args.no_sunblaze:
        train_env = gym.make(args.env_name)
        test_env = gym.make(args.env_name)
        if 'PyBullet' in args.env_name and args.render:
            train_env.render()
        train_env.reset()
    elif args.test_type == 'interpolation':
        train_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
        test_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
    elif args.test_type == 'extrapolation':
        train_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        train_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
        test_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomExtreme-v0'))
        test_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + 'RandomNormal-v0'))
    else:
        train_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))
        test_envs.append(sunblaze_envs.make('Sunblaze' + args.env_name + '-v0'))

    test_cnt = 0
    for train_env in train_envs:
        assert isinstance(train_env.observation_space, gym.spaces.Box)
        start_time = time.time()
        logger = Logger(logdir)

        is_discrete = isinstance(train_env.action_space, gym.spaces.Discrete)
        ob_dim = train_env.observation_space.shape[0]
        ac_dim = train_env.action_space.n if is_discrete else train_env.action_space.shape[0]
        reward_function = get_reward_function(train_env)
        train_env.reset()

        ensemble = Ensemble(ob_dim, ac_dim, is_discrete, args.pnn,
                            args.ensemble_size, args.lr, args.hidden_size,
                            device=nn_utils.DEVICE)

        # TRAIN
        # Instantiate policies
        mpc_policy = MPCPolicy(args, train_env, ensemble, reward_function, nn_utils.DEVICE)
        random_policy = RandomPolicy(train_env)

        # Instantiate data generator
        data_generator = DataGenerator(args, train_env, nn_utils.DEVICE,
                                       mpc_policy, random_policy,
                                       max_size=args.max_memory_size)

        if args.weights_paths is not None:
            # If weights are given, visualize and quit
            ensemble.load_weights(args.weights_paths)
            current_episodes, rewards, lengths = data_generator.generate_closed_loop_data(args.render)
            if args.mpe:
                MPE(train_env, current_episodes, ensemble, args.mpc_horizon,
                    label='Ensemble %s' % (args.weights_paths))
            print('avg reward episode %f' % (np.mean(rewards)))
            print('avg len %f' % (np.mean([len(ep) for ep in current_episodes])))
            return

        # Otherwise train the model on random trajectories
        current_episodes, train_rewards, train_lengths = data_generator.generate_random_data()

        # Train the initial model using random trajectories
        train_loss, test_loss = ensemble.train_net(args.epochs_rand, args.batch_size, data_generator,
                                                   samples_per_model=args.samples_per_model)

        if args.mpe:
            print('Computing MPE')
            for (i, model) in enumerate(ensemble.models):
                MPE(train_env, current_episodes, model, args.mpc_horizon,
                    label='random data, model %d' % (i))
            if len(ensemble.models) > 1:
                MPE(train_env, current_episodes, ensemble, args.mpc_horizon,
                    label='random data, ensemble')

        _, eval_rewards, eval_lengths = data_generator.generate_evaluation_data(render=args.render)

        # TODO: keep test data only for test data
        for itr in range(args.repetitions):
            print('\nMPC Repetition %d / %d \n' % (itr + 1, args.repetitions))
            epsilon = mpc_policy.update_epsilon(itr)
            perform_logging(itr, logger, eval_rewards, train_rewards, test_loss,
                            train_loss, eval_lengths, train_lengths, start_time, epsilon)

            current_episodes, train_rewards, train_lengths = data_generator.generate_closed_loop_data()

            train_loss, test_loss = ensemble.train_net(args.epochs_rl, args.batch_size, data_generator,
                                                       samples_per_model=args.samples_per_model)

            if args.mpe:
                print('Computing MPE')
                for (i, model) in enumerate(ensemble.models):
                    MPE(train_env, current_episodes, model, args.mpc_horizon,
                        label='rep %d, model %d' % (itr, i))
                if len(ensemble.models) > 1:
                    MPE(train_env, current_episodes, ensemble, args.mpc_horizon,
                        label='rep %d, ensemble' % (itr))

            _, eval_rewards, eval_lengths = data_generator.generate_evaluation_data(render=args.render)

            if args.save_model:
                for (i, model) in enumerate(ensemble.models):
                    save_file = '%s/models/rep_%d_model_%d_%.4f.pt' % (str(logdir), itr, i, test_loss[i][-1])
                    torch.save(model.state_dict(), save_file)

        # SUNBLAZE TEST
        for test_env in test_envs:
            test_name = test_env.unwrapped.spec.id
            train_name = train_env.unwrapped.spec.id
            if test_cnt < 3:
                print('\nTESTING: ' + train_name + ' on ' + test_name, flush=True)
                success_function = get_success_function(test_env)
                num_success = 0
                rewards = []
                for ep_num in range(args.test_episodes):
                    success, ep_reward = run_test_episode(test_env, mpc_policy,
                                                          success_function, args.render)
                    rewards.append(ep_reward)
                    num_success += int(success)
                    print('Test episode: %2d / %2d \t Success: %d \t Reward: %d' %
                          (ep_num + 1, args.test_episodes, int(success), ep_reward), flush=True)

                score = num_success / args.test_episodes * 100
                logger.log_scalar(score, test_name + '-' + train_name, 0)
                with open(train_name + '_' + test_name + '_score.txt', 'w+') as f:
                    f.write('Score for ' + train_name + ' tested on ' + test_name + ': ' + str(score))
                print('\nScore for ' + train_name + ' tested on ' + test_name + ' testing: ',
                      score, flush=True)
            test_cnt += 1
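# Hedged aside (not the repository's MPCPolicy, whose code is not shown here):
# a minimal random-shooting MPC sketch of the idea the loop above relies on --
# sample candidate action sequences, roll each one forward through the learned
# dynamics model, score the predicted trajectories with the reward function,
# and execute the first action of the best sequence. The callable interfaces
# below (dynamics_model, reward_fn, action_sampler) are assumptions for
# illustration, not the Ensemble/MPCPolicy signatures used above.
import numpy as np


def random_shooting_mpc(state, dynamics_model, reward_fn, action_sampler,
                        horizon=15, n_candidates=1000):
    """Return the first action of the best of n_candidates random action sequences.

    dynamics_model(s, a) -> predicted next state
    reward_fn(s, a)      -> scalar reward
    action_sampler()     -> one randomly sampled action
    """
    best_return, best_first_action = -np.inf, None
    for _ in range(n_candidates):
        actions = [action_sampler() for _ in range(horizon)]
        s, total = state, 0.0
        for a in actions:
            total += reward_fn(s, a)       # score the imagined transition
            s = dynamics_model(s, a)       # roll the model forward
        if total > best_return:
            best_return, best_first_action = total, actions[0]
    return best_first_action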
""" Generates sample figures visualizing game trees. First, it generates one game graph and saves it to 'game_graph.png' Then, it plays multiple games, and composes their graphs and saves it to 'multiple_game_graph.png' Requires the NetworkX graph package and GraphViz, which are included in Anaconda """ from gameplay import play_game from policies import RandomPolicy import networkx as nx player_policies = [RandomPolicy(), RandomPolicy()] G = play_game(player_policies) dot_graph = nx.to_pydot(G) dot_graph.set_graph_defaults(fontname='Courier') dot_graph.write_png('game_graph.png') games = [] for i in range(30): games.append(play_game(player_policies)) dot_graph_combined = nx.compose_all(games) dot_graph = nx.to_pydot(dot_graph_combined) dot_graph.set_graph_defaults(fontname='Courier') dot_graph.write_png('multiple_game_graph.png')
import argparse
import logging

import gym
import numpy as np
from tqdm import trange

from policies import RandomPolicy

_ENVS = ['CartPole-v1', 'MountainCar-v0', 'Acrobot-v1',
         'MountainCarContinuous-v0', 'Pendulum-v0']

if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', '-e', type=str, default='CartPole-v1', choices=_ENVS)
    args = parser.parse_args()

    logging.info('Making env {}'.format(args.env_name))
    env = gym.make(args.env_name)
    pi = RandomPolicy(env.observation_space, env.action_space)

    all_obs = []
    for _ in trange(100):
        obs = env.reset()
        all_obs.append(obs)
        while True:
            obs, _, done, _ = env.step(pi(obs))
            all_obs.append(obs)
            if done:
                break

    std_obs = np.std(all_obs, axis=0)
    print(repr(std_obs))

    import IPython; IPython.embed(); exit(0)
def set_policies(policies_name, user_segment, user_features, n_playlists):
    # Please see Section 3.3 of the RecSys paper for a description of the policies
    POLICIES_SETTINGS = {
        'random': RandomPolicy(n_playlists),
        'etc-seg-explore': ExploreThenCommitSegmentPolicy(user_segment, n_playlists, min_n=100, cascade_model=True),
        'etc-seg-exploit': ExploreThenCommitSegmentPolicy(user_segment, n_playlists, min_n=20, cascade_model=True),
        'epsilon-greedy-explore': EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon=0.1, cascade_model=True),
        'epsilon-greedy-exploit': EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon=0.01, cascade_model=True),
        'kl-ucb-seg': KLUCBSegmentPolicy(user_segment, n_playlists, cascade_model=True),
        'ts-seg-naive': TSSegmentPolicy(user_segment, n_playlists, alpha_zero=1, beta_zero=1, cascade_model=True),
        'ts-seg-pessimistic': TSSegmentPolicy(user_segment, n_playlists, alpha_zero=1, beta_zero=99, cascade_model=True),
        'ts-lin-naive': LinearTSPolicy(user_features, n_playlists, bias=0.0, cascade_model=True),
        'ts-lin-pessimistic': LinearTSPolicy(user_features, n_playlists, bias=-5.0, cascade_model=True),
        # Versions of epsilon-greedy-explore and ts-seg-pessimistic WITHOUT the cascade model
        'epsilon-greedy-explore-no-cascade': EpsilonGreedySegmentPolicy(user_segment, n_playlists, epsilon=0.1, cascade_model=False),
        'ts-seg-pessimistic-no-cascade': TSSegmentPolicy(user_segment, n_playlists, alpha_zero=1, beta_zero=99, cascade_model=False)
    }
    return [POLICIES_SETTINGS[name] for name in policies_name]
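# Hedged usage example (not from the original file): selecting a subset of the
# policies by name. user_segment, user_features, and n_playlists are assumed to
# come from the project's dataset-loading code, which is not shown in this excerpt.
policies = set_policies(['random', 'epsilon-greedy-explore', 'ts-seg-pessimistic'],
                        user_segment, user_features, n_playlists)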