def __init__(self, simulation_mode=False):
    """Set up the blobwar environment, the players and, unless running in
    simulation mode, the board GUI."""
    env = get_environment("blobwar")()
    env.seed(10)
    self.images = {1: "blob_blue", 2: "blob_orange"}
    self.images_selected = {
        1: "blob_blue_selected",
        2: "blob_orange_selected"
    }
    self.images_candidates = {
        1: "blob_blue_candidate",
        2: "blob_orange_candidate"
    }
    self.env = env
    self.human_player = Human()
    try:
        self.ai_player_1 = PPO(self.env, size=self.env.core.board.shape[0])
    except Exception:
        # Fall back to the greedy heuristic when no PPO model has been
        # trained for this board size.
        print("PPO not yet trained for board size:", self.env.core.board.shape[0])
        self.ai_player_1 = GreedyPlayer()
    self.ai_player_2 = GreedyPlayer()
    self.player1 = self.player2 = None
    if simulation_mode:
        # Headless self-play: both seats go to the same AI and no board
        # window is created.
        self.state = GameState.WAITING_FIRST_CLICK
        self.player1 = self.ai_player_1
        self.player2 = self.player1
        return
    self.game = Board(self.env.core.board.shape[0], self.env.core.board.shape[1])
    self.game.cell_size = 70
    self.game.cell_spacing = 1
    self.game.fill(None)
    self.game.margin_color = self.game.grid_color = "wheat4"
    self.game.cell_color = "white"
    self.game.title = "Blobwar"
    self.game.create_output(font_size=11)
    self.state = GameState.GAME_OVER
    self.move_start_case = None
    self.move_candidates_case = []
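# Usage sketch for the constructor above. The enclosing class name is not
# shown in this excerpt, so `BlobwarGUI` below is a hypothetical placeholder;
# the attributes referenced are the ones the constructor actually sets.
#
#     app = BlobwarGUI(simulation_mode=True)  # headless: AI vs itself, no Board window
#     assert app.player1 is app.player2       # both seats held by the same agent
#
#     app = BlobwarGUI()                      # full GUI: Board window, starts in GAME_OVER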
def main(args):
    logger.configure(config.LOGDIR)

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        logger.set_level(config.INFO)

    # make environment
    env = get_environment(args.env_name)(verbose=args.verbose, manual=args.manual)
    env.seed(args.seed)

    total_rewards = {}
    first_time = True

    if args.recommend:
        ppo_model = load_model(env, 'best_model.zip')
        ppo_agent = Agent('best_model', ppo_model)
    else:
        ppo_agent = None

    agents = []

    # load the agents
    if len(args.agents) != env.n_players:
        raise Exception(
            f'{len(args.agents)} players specified but this is a {env.n_players} player game!'
        )

    for i, agent in enumerate(args.agents):
        if agent == 'human':
            agent_obj = Agent('human')
        elif agent == 'rules':
            agent_obj = Agent('rules')
        elif agent == 'json':
            # Start the ZeroMQ reply server that the remote JSON client talks to
            context = zmq.Context()
            socket = context.socket(zmq.REP)
            socket.bind("tcp://*:5555")
            logger.debug("zmq server started on port 5555")
            agent_obj = Agent('json')
        elif agent == 'base':
            base_model = load_model(env, 'base.zip')
            agent_obj = Agent('base', base_model)
        else:
            ppo_model = load_model(env, f'{agent}.zip')
            agent_obj = Agent(agent, ppo_model)
        agents.append(agent_obj)
        total_rewards[agent_obj.id] = 0

    # play games
    logger.info(f'\nPlaying {args.games} games...')
    for game in range(args.games):
        players = agents[:]

        if args.randomise_players:
            random.shuffle(players)

        obs = env.reset()
        done = False

        for i, p in enumerate(players):
            logger.debug(f'Player {i+1} = {p.name}')

        while not done:
            current_player = players[env.current_player_num]
            env.render()
            logger.debug(f'\nCurrent player name: {current_player.name}')

            if args.recommend and current_player.name in ['human', 'rules', 'json']:
                # show recommendation from the last loaded model
                logger.debug(f'\nRecommendation by {ppo_agent.name}:')
                action = ppo_agent.choose_action(env, choose_best_action=True, mask_invalid_actions=True)

            if current_player.name == 'human':
                action = input('\nPlease choose an action: ')
                try:  # for int actions
                    action = int(action)
                except ValueError:  # for MultiDiscrete action input as list TODO
                    action = eval(action)
            elif current_player.name == 'json':
                # REP socket: reply to the previous request before receiving the
                # next one. On the very first turn there is nothing to reply to.
                if not first_time:
                    game_state = {
                        "legal_action": [i for i, o in enumerate(env.legal_actions) if o != 0],
                        "tableCard": env.tableCard.id
                    }
                    socket.send_json(game_state)
                action = socket.recv_json()
                first_time = False
                logger.debug(f'\nReceived {action}')
                try:  # for int actions
                    action = int(action)
                except ValueError:  # for MultiDiscrete action input as list TODO
                    action = eval(action)
            elif current_player.name == 'rules':
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(env, choose_best_action=False, mask_invalid_actions=True)
            else:
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(env, choose_best_action=args.best, mask_invalid_actions=True)

            obs, reward, done, _ = env.step(action)

            for r, player in zip(reward, players):
                total_rewards[player.id] += r
                player.points += r

            if args.cont:
                input('Press any key to continue')

        env.render()

        logger.info(f"Played {game + 1} games: {total_rewards}")

        if args.write_results:
            write_results(players, game, args.games, env.turns_taken)

        for p in players:
            p.points = 0

    env.close()
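# Minimal sketch of a remote client for the 'json' agent's ZeroMQ server
# above. This is an assumption-laden illustration, not part of the repo:
# it assumes the server is reachable on localhost:5555 and that, per the
# REQ/REP pattern the server follows (recv before send), the client must
# send its first action blind, before any game state has arrived. Message
# shapes mirror the server code: the client sends a bare action, the server
# replies with {"legal_action": [...], "tableCard": <id>}.
import zmq


def play_remote():
    context = zmq.Context()
    socket = context.socket(zmq.REQ)
    socket.connect("tcp://localhost:5555")

    action = 0  # blind opening move; refined once state arrives
    while True:
        socket.send_json(action)           # answered by the server's recv_json()
        state = socket.recv_json()         # next observation from the server
        action = state["legal_action"][0]  # naive policy: first legal action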
def main(args):
    rank = MPI.COMM_WORLD.Get_rank()

    model_dir = os.path.join(config.MODELDIR, args.env_name)

    if rank == 0:
        try:
            os.makedirs(model_dir)
        except:
            pass
        if args.reset:
            reset_files(model_dir)
        logger.configure(config.LOGDIR)
    else:
        logger.configure(format_strs=[])

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        time.sleep(5)
        logger.set_level(config.INFO)

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)

    logger.info('\nSetting up the selfplay training environment opponents...')
    base_env = get_environment(args.env_name)
    env = selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose)
    env.seed(workerseed)

    CustomPolicy = get_network_arch(args.env_name)

    params = {
        'gamma': args.gamma,
        'timesteps_per_actorbatch': args.timesteps_per_actorbatch,
        'clip_param': args.clip_param,
        'entcoeff': args.entcoeff,
        'optim_epochs': args.optim_epochs,
        'optim_stepsize': args.optim_stepsize,
        'optim_batchsize': args.optim_batchsize,
        'lam': args.lam,
        'adam_epsilon': args.adam_epsilon,
        'schedule': 'linear',
        'verbose': 1,
        'tensorboard_log': config.LOGDIR
    }

    # allow time for the base model to be saved out when the environment is created
    time.sleep(5)

    if args.reset or not os.path.exists(os.path.join(model_dir, 'best_model.zip')):
        logger.info('\nLoading the base PPO agent to train...')
        model = PPO1.load(os.path.join(model_dir, 'base.zip'), env, **params)
    else:
        logger.info('\nLoading the best_model.zip PPO agent to continue training...')
        model = PPO1.load(os.path.join(model_dir, 'best_model.zip'), env, **params)

    # Callbacks
    logger.info('\nSetting up the selfplay evaluation environment opponents...')
    callback_args = {
        'eval_env': selfplay_wrapper(base_env)(opponent_type=args.opponent_type, verbose=args.verbose),
        'best_model_save_path': config.TMPMODELDIR,
        'log_path': config.LOGDIR,
        'eval_freq': args.eval_freq,
        'n_eval_episodes': args.n_eval_episodes,
        'deterministic': False,
        'render': True,
        'verbose': 0
    }

    if args.rules:
        logger.info('\nSetting up the evaluation environment against the rules-based agent...')
        # Evaluate against a 'rules' agent as well
        eval_actual_callback = EvalCallback(
            eval_env=selfplay_wrapper(base_env)(opponent_type='rules', verbose=args.verbose),
            eval_freq=1,
            n_eval_episodes=args.n_eval_episodes,
            deterministic=args.best,
            render=True,
            verbose=0
        )
        callback_args['callback_on_new_best'] = eval_actual_callback

    # Evaluate the agent against previous versions of itself
    eval_callback = SelfPlayCallback(args.opponent_type, args.threshold, args.env_name, **callback_args)

    logger.info('\nSetup complete - commencing learning...\n')

    model.learn(total_timesteps=int(1e9), callback=[eval_callback], reset_num_timesteps=False, tb_log_name="tb")

    env.close()
    del env
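# Launch sketch for the training entry point above. The attribute names are
# taken from what main() reads off `args`; the values are illustrative
# assumptions only, not the repo's actual defaults. PPO1 is MPI-based, so
# this is typically launched under e.g. `mpirun -np 4 python train.py ...`.
from argparse import Namespace

if __name__ == '__main__':
    args = Namespace(
        env_name='blobwar', reset=False, debug=False, seed=17,
        opponent_type='mostly_best', verbose=False, rules=False, best=True,
        gamma=0.99, timesteps_per_actorbatch=1024, clip_param=0.2,
        entcoeff=0.1, optim_epochs=4, optim_stepsize=0.0003,
        optim_batchsize=1024, lam=0.95, adam_epsilon=1e-5,
        eval_freq=10240, n_eval_episodes=100, threshold=0.2,
    )
    main(args)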
def main(args):
    logger.configure(config.LOGDIR)

    if args.debug:
        logger.set_level(config.DEBUG)
    else:
        logger.set_level(config.INFO)

    # make environment
    env = get_environment(args.env_name)(verbose=args.verbose, manual=args.manual)
    env.seed(args.seed)

    total_rewards = {}

    if args.recommend:
        ppo_model = load_model(env, 'best_model.zip')
        ppo_agent = Agent('best_model', ppo_model)
    else:
        ppo_agent = None

    agents = []

    # load the agents
    if len(args.agents) != env.n_players:
        raise Exception(f'{len(args.agents)} players specified but this is a {env.n_players} player game!')

    for i, agent in enumerate(args.agents):
        if agent == 'human':
            agent_obj = Agent('human')
        elif agent == 'greedy':
            agent_obj = Agent('greedy')
        elif agent == 'rules':
            agent_obj = Agent('rules')
        elif agent == 'base':
            base_model = load_model(env, 'base.zip')
            agent_obj = Agent('base', base_model)
        else:
            ppo_model = load_model(env, f'{agent}.zip')
            agent_obj = Agent(agent, ppo_model)
        agents.append(agent_obj)
        total_rewards[agent_obj.id] = 0

    if args.env_name == "blobwar":
        human_blobwar = Human()

    # play games
    logger.info(f'\nPlaying {args.games} games...')
    for game in range(args.games):
        players = agents[:]

        if args.randomise_players:
            random.shuffle(players)

        obs = env.reset()
        done = False

        for i, p in enumerate(players):
            logger.debug(f'Player {i+1} = {p.name}')

        while not done:
            current_player = players[env.current_player_num]
            env.render()
            logger.debug(f'Current player name: {current_player.name}')

            if args.recommend and current_player.name in ['human', 'rules']:
                # show recommendation from the last loaded model
                logger.debug(f'\nRecommendation by {ppo_agent.name}:')
                action = ppo_agent.choose_action(env, choose_best_action=True, mask_invalid_actions=True)

            if current_player.name == 'human':
                if args.env_name == "blobwar":
                    # For blobwar the human plays via the GUI helper; the chosen
                    # move is translated back into the env's discrete action space.
                    move = human_blobwar.compute_next_move(env.core)
                    action = env.encode_action(move)
                else:
                    action = input('\nPlease choose an action: ')
                    try:  # for int actions
                        action = int(action)
                    except ValueError:  # for MultiDiscrete action input as list TODO
                        action = eval(action)
            elif current_player.name == 'rules':
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(env, choose_best_action=False, mask_invalid_actions=True)
            else:
                logger.debug(f'\n{current_player.name} model choices')
                action = current_player.choose_action(env, choose_best_action=args.best, mask_invalid_actions=True)

            obs, reward, done, _ = env.step(action)

            for r, player in zip(reward, players):
                total_rewards[player.id] += r
                player.points += r

            if args.cont:
                input('Press any key to continue')

        env.render()

        logger.info(f"Played {game + 1} games: {total_rewards}")

        if args.write_results:
            write_results(players, game, args.games, env.turns_taken)

        for p in players:
            p.points = 0

    env.close()
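# Sketch of the duck-typed interface the play loop above relies on. Any
# object exposing these members can stand in for Agent; the attribute and
# method names are taken from how the loop uses them, while the policy
# itself is a hypothetical stand-in.
class ScriptedAgent:
    def __init__(self, name):
        self.id = name        # key into total_rewards
        self.name = name      # selects the branch taken in the play loop
        self.points = 0       # per-game score, reset after write_results

    def choose_action(self, env, choose_best_action, mask_invalid_actions):
        # Naive stand-in policy: pick the first legal action.
        return next(i for i, legal in enumerate(env.legal_actions) if legal)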