def makeTrainingObservation():
    env = Pomme(**config["env_kwargs"])
    agents = {}
    for agent_id in range(num_players):
        agent = TrainingAgent(config["agent"](agent_id, config["game_type"]))
        agents[agent_id] = agent
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)
    return env
def env_for_players(self):
    config = ffa_v0_fast_env(30)
    env = Pomme(**config["env_kwargs"])
    agents = [
        DQN(config["agent"](0, config["game_type"])),
        PlayerAgent(config["agent"](1, config["game_type"])),
        RandomAgent(config["agent"](2, config["game_type"])),
        RandomAgent(config["agent"](3, config["game_type"])),
    ]
    env.set_agents(agents)
    env.set_training_agent(agents[0].agent_id)  # the DQN agent is the only training agent
    env.set_init_game_state(None)
    return env
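# For context on the set_training_agent(...) calls above: once a training agent
# is registered, Pomme.act(obs) returns actions for the *other* agents only, and
# the learner's action has to be inserted at the training agent's index before
# stepping. A minimal, illustrative sketch (my_action stands in for whatever the
# learner produces; it is not defined in the snippets here):
obs = env.reset()
done = False
while not done:
    all_actions = env.act(obs)                          # scripted agents' actions
    all_actions.insert(env.training_agent, my_action)   # hypothetical learner action
    obs, reward, done, info = env.step(all_actions)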
def _thunk():
    env = Pomme(**config["env_kwargs"])
    agents = {}
    for agent_id in range(num_players):
        agent = TrainingAgent(config["agent"](agent_id, config["game_type"]))
        agents[agent_id] = agent
    simple_agent_id = num_players
    agents[simple_agent_id] = SimpleAgent(
        config["agent"](simple_agent_id, config["game_type"]))
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)
    return env
def make_env(self, config):
    # Instantiate the environment
    env = Pomme(**config["env_kwargs"])
    # Add agents: this agent in its own slot, SimpleAgents everywhere else
    agents = []
    for agent_id in range(NUM_AGENTS):
        if agent_id == self.agent_id:
            agents.append(self)
        else:
            agents.append(
                SimpleAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_init_game_state(None)
    return env
def set_pommerman_env(agent_id=0):
    # Instantiate the environment
    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    np.random.seed(0)
    env.seed(0)
    # Add 3 SimpleAgents and 1 DQN agent
    agents = [
        DQN(config["agent"](agent_id, config["game_type"]))
        if i == agent_id else SimpleAgent(config["agent"](i, config["game_type"]))
        for i in range(4)
    ]
    env.set_agents(agents)
    env.set_training_agent(agents[agent_id].agent_id)  # the DQN agent is the only training agent
    env.set_init_game_state(None)
    return env
def get_env():
    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    agent_id = 0
    agents = [
        DQN(config["agent"](0, config["game_type"])),
        SimpleAgent(config["agent"](1, config["game_type"])),
        SimpleAgent(config["agent"](2, config["game_type"])),
        SimpleAgent(config["agent"](3, config["game_type"])),
    ]
    env.set_agents(agents)
    env.set_training_agent(agents[agent_id].agent_id)
    env.set_init_game_state(None)
    return env
def main():
    # Print all possible environments in the Pommerman registry
    print(pommerman.registry)
    config = ffa_v1_env()
    env = Pomme(**config["env_kwargs"])

    # Add 4 SimpleAgents (a PlayerAgent can be swapped into slot 3)
    agents = {}
    for agent_id in range(4):
        agents[agent_id] = SimpleAgent(config["agent"](agent_id, config["game_type"]))
    # agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]), "arrows")
    env.set_agents(list(agents.values()))
    env.set_init_game_state(None)

    demo = []
    # Run the episodes just like OpenAI Gym
    for i_episode in range(1):
        state = env.reset()
        done = False
        demo.append(env.get_json_info())
        while not done:
            env.render()
            actions = env.act(state)
            state, reward, done, info = env.step(actions)
            demo.append(env.get_json_info())
        if 1 in reward:
            winner = reward.index(1)
        else:
            winner = None
        print('Episode {} finished'.format(i_episode))
    env.close()

    # If the game was not tied, save the demonstration
    if winner is not None:
        demonstration = {'demo': demo, 'winner': winner}
        pickle.dump(demonstration, open("demonstration.p", "wb"))
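# The demonstration pickled by main() above can be loaded back for replay or
# imitation learning. A small, illustrative sketch (only the file name comes
# from the snippet above):
import pickle

with open("demonstration.p", "rb") as f:
    demonstration = pickle.load(f)

print("winner:", demonstration["winner"])
print("recorded states:", len(demonstration["demo"]))
# Each entry in demonstration["demo"] is the JSON game state returned by
# env.get_json_info(), so a state can be written to disk and fed back through
# env.set_init_game_state(path) to restart the game from that point.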
class MultiAgent(MultiAgentEnv):
    def __init__(self):
        super(MultiAgent, self).__init__()
        self.phase = 0
        self.setup()

    def setup(self):
        agents = []
        if self.phase == 0:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 20
            agents.insert(0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 1:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 2:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, NoDoAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 3:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 2
            config["env_kwargs"]["num_items"] = 2
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, NoDoAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, NoDoAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        if self.phase == 4:
            self.agents_index = [1, 3]
            self.enemies_agents_index = [0, 2]
            config = team_v0_fast_env()
            config["env_kwargs"]["num_wood"] = 0
            config["env_kwargs"]["num_items"] = 10
            config["env_kwargs"]["num_rigid"] = 36
            agents.insert(0, SuicidalAgent(config["agent"](0, config["game_type"])))
            agents.insert(2, SimpleAgent(config["agent"](2, config["game_type"])))
            print(config["env_kwargs"])
            self.env = Pomme(**config["env_kwargs"])
            self.env.seed()
        for agent_id in self.agents_index:
            agents.insert(
                agent_id,
                BaseLineAgent(config["agent"](agent_id, config["game_type"])))
        self.env.set_agents(agents)
        self.env.set_init_game_state(None)
        self.observation_space = spaces.Dict({
            "boards": spaces.Box(low=-1, high=20, shape=(3, 11, 11)),
            "states": spaces.Box(low=-1, high=20, shape=(9, )),
        })
        # NOTE: this Box is constructed but never assigned to anything.
        spaces.Box(low=-1.0, high=20.0, shape=(372, ), dtype=np.float32)
        self.action_space = self.env.action_space

    def set_phase(self, phase):
        print("learn phase " + str(phase))
        self.phase = phase
        self.setup()
        self.reset()

    def step(self, actions):
        obs = self.env.get_observations()
        all_actions = self.env.act(obs)
        for index in self.agents_index:
            try:
                action = actions[index]
            except (KeyError, IndexError):
                action = 0
            all_actions[index] = action
        step_obs = self.env.step(all_actions)
        obs, rew, done, info = {}, {}, {}, {}
        for i in actions.keys():
            obs[i], rew[i], done[i], info[i] = [
                featurize(step_obs[0][i]),
                step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2],
                step_obs[3],
            ]
        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        obs = self.env.reset()
        return {i: featurize(obs[i]) for i in self.agents_index}
class MultiAgend(MultiAgentEnv):
    def __init__(self):
        super(MultiAgend, self).__init__()
        self.phase = 0
        self.next_phase = 0
        self.steps = 0
        self.last_featurize_obs = None
        self.setup()

    def featurize(self, obs):
        enemies = []
        for agent_id in self.enemies_agents_index:
            if agent_id == 0:
                enemies.append(Item.Agent0)
            if agent_id == 1:
                enemies.append(Item.Agent1)
            if agent_id == 2:
                enemies.append(Item.Agent2)
            if agent_id == 3:
                enemies.append(Item.Agent3)
        # Zero out board cells for agents listed in obs["enemies"] that are not
        # actual opponents.
        for enemie in obs["enemies"]:
            if enemie not in enemies:
                obs["board"] = ma.masked_equal(obs["board"], enemie.value).filled(fill_value=0)
        board = np.copy(obs["board"])
        board[obs["position"][0], obs["position"][1]] = 0.0
        enemie_pos = np.full((11, 11), 0)
        for enemie in obs["enemies"]:
            enemie_pos = enemie_pos | ma.masked_not_equal(board, enemie.value).filled(fill_value=0)
            board = ma.masked_equal(board, enemie.value).filled(fill_value=0)
        wood = ma.masked_not_equal(board, 2).filled(fill_value=0)
        wood = (wood > 0).astype(np.float32)
        board = ma.masked_equal(board, 2).filled(fill_value=0)
        stone = ma.masked_not_equal(board, 1).filled(fill_value=0)
        stone = (stone > 0).astype(np.float32)
        board = ma.masked_equal(board, 1).filled(fill_value=0)
        enemie_pos = (enemie_pos > 0).astype(np.float32)
        board = ma.masked_equal(board, obs["teammate"].value).filled(fill_value=0)
        flames = ma.masked_not_equal(board, 4).filled(fill_value=0)
        flames = (flames > 0).astype(np.float32)
        board = ma.masked_equal(board, 4).filled(fill_value=0)
        board = ma.masked_equal(board, 3).filled(fill_value=0)
        teammate_pos = ma.masked_not_equal(board, obs["teammate"].value).filled(fill_value=0)
        teammate_pos = (teammate_pos > 0).astype(np.float32)
        board = ma.masked_equal(board, obs["teammate"].value).filled(fill_value=0)
        items = board.astype(np.float32)
        pos = np.full((11, 11), 0)
        pos[obs["position"][0], obs["position"][1]] = 1.0
        pos = pos.astype(np.float32)
        bomb_life = obs["bomb_life"].astype(np.float32)
        bomb_blast_strength = obs["bomb_blast_strength"].astype(np.float32)
        ammo = utility.make_np_float([obs["ammo"]])
        blast_strength = utility.make_np_float([obs["blast_strength"]])
        can_kick = utility.make_np_float([obs["can_kick"]])
        game_end = utility.make_np_float([
            (self.max_steps - self.steps) / self.max_steps
        ])
        actual_featurize_obs = {
            'boards': np.stack([
                enemie_pos, pos, wood, stone, items, flames, teammate_pos,
                bomb_life, bomb_blast_strength
            ], axis=0),
            'states': np.concatenate([ammo, blast_strength, can_kick, game_end]),
        }
        # Stack the previous frame on top of the current one (frame stacking).
        if self.last_featurize_obs is None:
            featurize_obs = {
                'boards': np.concatenate([
                    actual_featurize_obs['boards'], actual_featurize_obs['boards']
                ], axis=0),
                'states': np.concatenate([
                    actual_featurize_obs['states'], actual_featurize_obs['states']
                ]),
            }
        else:
            featurize_obs = {
                'boards': np.concatenate([
                    self.last_featurize_obs['boards'], actual_featurize_obs['boards']
                ], axis=0),
                'states': np.concatenate([
                    self.last_featurize_obs['states'], actual_featurize_obs['states']
                ]),
            }
        self.last_featurize_obs = actual_featurize_obs
        return featurize_obs

    def setup(self):
        agents = []
        if self.phase == 0:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index, config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents': '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            init_state['board'] = json.dumps(board.tolist())
            agents_json = json.loads(copy.copy(init_state['agents']))
            random_pos = np.random.choice(board.shape[0], (2, 2), replace=False)
            agents_json[0]["position"] = random_pos[0].tolist()
            agents_json[1]["position"] = random_pos[1].tolist()
            init_state['agents'] = json.dumps(agents_json)
            self.env._init_game_state = init_state
            self.env.reset()
        if self.phase == 1:
            arr = [0, 1]
            random.shuffle(arr)
            agents_index = arr.pop()
            op_index = arr.pop()
            self.agents_index = [agents_index]
            self.enemies_agents_index = [op_index]
            self.max_steps = 200
            config = ffa_v0_fast_env()
            config["env_kwargs"]["max_steps"] = self.max_steps
            agents.insert(
                agents_index,
                BaseLineAgent(config["agent"](agents_index, config["game_type"])))
            agents.insert(
                op_index,
                NoDoAgent(config["agent"](op_index, config["game_type"])))
            self.env = Pomme(**config["env_kwargs"])
            self.env.set_agents(agents)
            init_state = {
                'board_size': '11',
                'step_count': '0',
                'board': '',
                'agents': '[{"agent_id": 0, "is_alive": true, "position": [1, 1], "ammo": 1, "blast_strength": 2, "can_kick": false}, {"agent_id": 1, "is_alive": true, "position": [9, 0], "ammo": 1, "blast_strength": 2, "can_kick": false}]',
                'bombs': '[]',
                'flames': '[]',
                'items': '[]',
                'intended_actions': '[0, 0]'
            }
            board = np.full((11, 11), 0)
            board[5, :] = (np.ones(11) * 2)  # a row of wood splits the board
            agents_json = json.loads(copy.copy(init_state['agents']))
            agents_json[0]["position"] = [random.randint(0, 4), random.randint(0, 10)]
            agents_json[1]["position"] = [random.randint(6, 10), random.randint(0, 10)]
            init_state['agents'] = json.dumps(agents_json)
            init_state['board'] = json.dumps(board.tolist())
            self.env._init_game_state = init_state
            self.env.reset()
        self.observation_space = spaces.Dict({
            'boards': spaces.Box(low=-1, high=25, shape=(11, 11, 18), dtype=np.float32),
            'states': spaces.Box(low=-1, high=25, shape=(8, ), dtype=np.float32)
        })
        self.action_space = self.env.action_space

    def set_phase(self, phase):
        print("learn phase " + str(phase))
        self.next_phase = phase

    def close(self):
        self.env.close()

    def step(self, actions):
        self.steps = self.steps + 1
        obs = self.env.get_observations()
        all_actions = self.env.act(obs)
        assert (len(all_actions) == len(self.agents_index) + len(self.enemies_agents_index))
        for index in self.agents_index:
            try:
                action = actions[index]
            except (KeyError, IndexError):
                action = 0
            assert (all_actions[index] is None)
            all_actions[index] = action
        step_obs = self.env.step(all_actions)
        obs, rew, done, info = {}, {}, {}, {}
        for i in actions.keys():
            obs[i], rew[i], done[i], info[i] = [
                self.featurize(step_obs[0][i]),
                step_obs[1][i],
                step_obs[1][i] == -1 or step_obs[2],
                step_obs[3]
            ]
        done["__all__"] = step_obs[2]
        return obs, rew, done, info

    def reset(self):
        self.steps = 0
        self.phase = self.next_phase
        self.setup()
        obs = self.env.get_observations()
        return {i: self.featurize(obs[i]) for i in self.agents_index}
for agent_id in range(4):
    if agent_id == agent_pos:
        # agents.append(Cnn12833Dense1281(env.action_space.n, BOARD_SIZE,
        #                                 character=config["agent"](agent_id, config["game_type"]),
        #                                 save_path=model_path))
        # agents.append(Dense82(env.action_space.n, BOARD_SIZE,
        #                       character=config["agent"](agent_id, config["game_type"]),
        #                       save_path=model_path2))
        # agents.append(Dense128(env.action_space.n, BOARD_SIZE,
        #                        character=config["agent"](agent_id, config["game_type"]),
        #                        save_path=model_path3))
        # agents.append(Dense128(env.action_space.n, BOARD_SIZE,
        #                        character=config["agent"](agent_id, config["game_type"]),
        #                        save_path='./dqn/model/ddgp_dense_128_1_rs/model.h4'))
        agents.append(Cnn12832Dense1281(
            env.action_space.n, BOARD_SIZE,
            character=config["agent"](agent_id, config["game_type"]),
            save_path='./dqn/model/ddgp_cnn128_3_2_dense_128_1_rs/model.h4'))
    else:
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

env.set_agents(agents)
env.set_init_game_state(None)

# Seed and reset the environment
env.seed(0)
obs = env.reset()

# Run the agents until we're done
done = False
while not done:
    env.render()
    actions = env.act(obs)
    obs, reward, done, info = env.step(actions)
env.render(close=True)
env.close()
class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    ammo = 1
    blast_strength = 2
    state = {}

    def __init__(self, env_config={}):
        pomme_config = pommerman.configs.ffa_competition_env()
        self.reward = Reward(env_config.get("reward"))
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)
        self.init_state()

    def init_state(self):
        self.state['agent_id'] = self.agent_id
        self.state['alive'] = self.alive_agents
        self.state['visited'] = set()
        self.state['blast_strength'] = self.blast_strength
        self.state['ammo'] = self.ammo
        self.state["bombs"] = {}

    def reset(self):
        all_obs = self.pomme.reset()
        obs = self.get_for_training_agent(all_obs)
        self.init_state()
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        obs = self.build_obs(obs, self.state)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.state['all_obs'])
        actions = self.set_for_training_agent(actions, action)
        all_obs, _, _, _ = self.pomme.step(actions)
        obs = self.get_for_training_agent(all_obs)
        info = {'board': obs['board'], 'blast_strength': obs['blast_strength']}
        done = self.get_done(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        self.state['blast_strength'] = obs['blast_strength']
        self.state['ammo'] = obs['ammo']
        obs = self.build_obs(obs, self.state)
        return obs, reward, done, info

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    def build_obs(self, obs, state):
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        flame_life = obs['flame_life']
        agent_id = state['agent_id']
        ammo = state['ammo']
        passage = np.zeros_like(board)
        wall = np.zeros_like(board)
        wood = np.zeros_like(board)
        bomb = np.zeros_like(board)
        bonus = np.zeros_like(board)
        me = np.zeros_like(board)
        enemy = np.zeros_like(board)
        for y in range(board.shape[0]):
            for x in range(board.shape[1]):
                v = board[y][x]
                if v == 0:
                    passage[y][x] = 1
                elif v == 1:
                    wall[y][x] = 1
                elif v == 2:
                    wood[y][x] = 1
                elif v == 3:
                    bomb = create_cross(bomb, (y, x), bomb_blast_strength[y][x])
                elif v == 4:
                    pass
                elif v == 6 or v == 7:
                    bonus[y][x] = 1
                elif v >= 10:
                    if v == agent_id:
                        me[y][x] = 1
                    else:
                        enemy[y][x] = 1
                    if bomb_blast_strength[y][x] > 0:
                        bomb = create_cross(bomb, (y, x), bomb_blast_strength[y][x])
        ammo = ammo * np.ones_like(board) / 12
        bomb_life /= 9
        flame_life /= 3
        board = np.transpose(
            np.stack([
                passage, wall, wood, bomb, bonus, me, enemy, bomb_life,
                flame_life, ammo
            ]), [1, 2, 0])
        return board

    @staticmethod
    def init_observation_space(env_config):
        """
        Observations for agents:
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        # 10 channels: passage, wall, wood, bomb, bonus, me, enemy,
        # bomb_life, flame_life, ammo
        board = spaces.Box(low=0, high=1, shape=(board_size, board_size, 10))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        ammo = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        # return spaces.Dict({"board": board, "bomb_life": bomb_life,
        #                     "flame_life": flame_life, "ammo": ammo})
        return board

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    def render(self):
        self.pomme.render()
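# A PomFFA-style single-agent wrapper like the one above is usually handed to
# RLlib through a registered constructor. A minimal sketch, assuming an older
# Ray/RLlib release where tune.run("PPO", ...) is available; the env name and
# hyperparameters below are illustrative only:
import ray
from ray import tune
from ray.tune.registry import register_env

register_env("pomme_ffa", lambda env_config: PomFFA(env_config))

ray.init()
tune.run(
    "PPO",
    stop={"training_iteration": 10},
    config={
        "env": "pomme_ffa",
        "env_config": {"is_training": True},
        "num_workers": 1,
    },
)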
def main():
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False
    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True
    config = ffa_competition_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    with open('ppo.json', 'r') as fp:
        agent = json.load(fp=fp)
    with open('mlp2_lstm_network.json', 'r') as fp:
        network = json.load(fp=fp)
    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=dict(type='int', num_actions=env.action_space.n),
            network=network
        )
    )

    # Add 3 SimpleAgents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
    # Add TensorforceAgent
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # Instantiate and run the environment.
    if VISUALIZE:
        wrapped_env = WrappedEnv(env, True)
    else:
        wrapped_env = WrappedEnv(env)
    runner = Runner(agent=agent, environment=wrapped_env)
    rewards = []
    episodes = []

    def episode_finished(r):
        nonlocal episodes
        nonlocal rewards
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(
            ep=r.episode, ts=r.episode_timestep, reward=r.episode_rewards[-1]))
        if r.episode % 1000 == 0:
            agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
            try:
                prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
                prev_len = len(prev_data[0])
                prev_data[0].extend(rewards)
                rewards = []
                prev_data[1].extend(episodes)
                episodes = []
                pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
            except (OSError, IOError) as e:
                pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))
        if r.episode_rewards[-1] >= 5:
            print()
            print()
            print()
            print("WINNER WINNER CHICKEN DINNER")
        episodes.append(r.episode)
        rewards.append(r.episode_rewards[-1])
        return True

    # Restore, train, and save the model
    if args.test or args.resume:
        # If testing or resuming, restore the saved model
        agent.restore_model('./', 'PPO_K_someS_500batch_biggerreward_99dis')
    runner.run(episodes=EPISODES, max_episode_timesteps=2000,
               episode_finished=episode_finished, deterministic=False)
    if not args.test:
        agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
    print("Stats: ", runner.episode_rewards[-5:], runner.episode_timesteps[-5:])

    # Dump reward values
    try:
        prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
        prev_len = len(prev_data[0])
        prev_data[0].extend(rewards)
        prev_data[1].extend(episodes)
        print(episodes)
        pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
    except (OSError, IOError) as e:
        pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

    try:
        runner.close()
    except AttributeError as e:
        pass
class Pomme_v0(MultiAgentEnv):
    '''
    A wrapped Pommerman v0 environment for usage with Ray RLlib.

    The v0 environment is the base environment used in the NIPS'18 competition.
    Contrary to v1 it doesn't collapse walls and also doesn't allow for radio
    communication between agents (as does v2).

    Agents are identified by (string) agent IDs: `AGENT_IDS`
    (Note that these "agents" here are not to be confused with RLlib agents.)
    '''

    def __init__(self, config=pommerman_cfg.team_competition_env()):
        '''
        Initializes the Pommerman environment and adds Dummy Agents as expected
        by `Pomme`.

        Args:
            config (dict): A config defining the game mode. Options include FFA
                mode, team (2v2) and team radio (2v2). See pommerman's config.py
                and docs for more details.
        '''
        self.pomme = Pomme(**config['env_kwargs'])
        # NOTE: left as the builtin `dict`; observations are returned as raw dictionaries.
        self.observation_space = dict
        self.action_space = self.pomme.action_space
        self.agent_names = AGENT_IDS
        agent_list = []
        for i in range(4):
            agent_id = i
            agent_list.append(
                agents.BaseAgent(config["agent"](agent_id, config["game_type"])))
        self.pomme.set_agents(agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        """
        Resets the env and returns observations from ready agents.

        Returns:
            obs (dict): New observations for each ready agent.
        """
        obs_list = self.pomme.reset()
        # return {key: featurize(val) for key, val in to_dict(obs_list).items()}
        return {key: val for key, val in to_dict(obs_list).items()}

    def step(self, action_dict):
        """
        Returns observations from ready agents.

        The returns are dicts mapping from agent_id strings to values. The
        number of agents in the env can vary over time.

        Returns:
            obs (dict): New observations for each ready agent.
            rewards (dict): Reward values for each ready agent. If the episode
                has just started, the value will be zero.
            dones (dict): Done values for each ready agent. The key "__all__"
                is used to indicate the end of the game.
            infos (dict): Info values for each ready agent.
        """
        # Default actions, since the Pommerman env expects actions even if an agent is dead.
        actions = {'agent_0': 0, 'agent_1': 0, 'agent_2': 0, 'agent_3': 0}
        # Update actions with the ones returned from the policies.
        actions.update(action_dict)
        # Perform the env step (expects a list).
        obs, rewards, done, info = self.pomme.step(list(actions.values()))
        # Build the 'dones' dictionary; the key __all__ indicates env termination.
        dones = {'__all__': done}
        # Fetch all done agents.
        done_agents = to_dict([not agent.is_alive for agent in self.pomme._agents])
        # Filter the done dictionary to only return agents which are still alive
        # -> apparently this is how rllib determines when agents "die".
        dones.update({key: val for key, val in done_agents.items() if not val})
        # Turn the info dict into a dictionary with agent IDs as keys.
        infos = {
            AGENT_IDS[i]: {info_k: info_v for info_k, info_v in info.items()}
            for i in range(NUM_PLAYERS)
        }
        return to_dict(obs), to_dict(rewards), dones, infos
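# Because Pomme_v0 exposes all four board agents to RLlib, it needs a multiagent
# policy mapping. A minimal sketch, assuming an older RLlib config format;
# obs_space and act_space stand for concrete gym spaces and are placeholders
# here (the wrapper above leaves observation_space as a raw dict):
from ray.tune.registry import register_env

register_env("pomme_v0_team", lambda env_config: Pomme_v0())

multiagent_config = {
    "policies": {
        # None -> use the trainer's default policy class for this algorithm.
        "shared_policy": (None, obs_space, act_space, {}),
    },
    # Map every board agent ("agent_0" ... "agent_3") onto the shared policy.
    "policy_mapping_fn": lambda agent_id: "shared_policy",
}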
class PomFFA(gym.Env):
    agent_list = [
        agents.RandomAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent(),
        agents.SimpleAgent()
    ]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()
        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
        print("pomme_config: ")
        print(pomme_config['env_kwargs'])
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preproess(obs)
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 1
            else:
                return -1
        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -1
        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -1
        # x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]
        #
        # for w in range(11):
        #     if blast[x][w] > int(math.fabs(w - y)):
        #         return -10
        #
        #     if blast[w][y] > int(math.fabs(w - x)):
        #         return -10
        return 0

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        # print(obs)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preproess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
        Observations for agents:
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))
        # board_size = 11
        board = spaces.Box(low=0, high=len(constants.Item), shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def preproess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    all_obs = None
    all_action = None
    pre_obs = None
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    state = {}

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()
        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
            self.reward = Reward(env_config.get("reward"))
        else:
            self.reward = Reward()
        print("Pommerman Config:", pomme_config['env_kwargs'])
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        state = {
            "prev_obs": None,
            "visited": set(),
            "agent_id": 10,
            "alive": [10, 11, 12, 13],
            "strength": 2,
            "ammo": 1,
            "bombs": {},
        }
        state['prev_obs'] = copy.deepcopy(obs)
        state['position'] = obs['position']
        self.state = state
        obs = self.preproess(obs)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        actions = self.set_for_training_agent(actions, action)
        obs, rewards, _, _ = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        done = self.get_done(obs)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['position'] = obs['position']
        obs = self.preproess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    @staticmethod
    def init_observation_space(env_config):
        """
        Observations for agents:
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        board = spaces.Box(low=0, high=len(constants.Item), shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    @staticmethod
    def preproess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
class PomFFA(gym.Env):
    agent_list = [HoldAgent(), HoldAgent(), HoldAgent(), HoldAgent()]
    all_obs = None
    all_action = None
    cur_obs = None
    alive_agents = [10, 11, 12, 13]
    player_agent_id = 10

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()
        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
        print("pomme_config: ")
        print(pomme_config['env_kwargs'])
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))
        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        obs = self.preproess(obs)
        self.total_reward = 0
        self.prev_alive = 4
        self.visited = np.zeros(shape=(11, 11))
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 1.0 - self.total_reward
            else:
                return -0.5
        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -0.5
        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -0.5
        x, y = obs["position"]
        blast = obs["bomb_blast_strength"]
        px = [1, -1, 0, 0]
        py = [0, 0, -1, 1]
        sum_reward = 0.0
        sum_reward += 20 * (len(obs["alive"]) - self.prev_alive)
        self.prev_alive = len(obs["alive"])
        if action == 0:
            sum_reward -= 0.1
        elif action == 5:
            # sum_reward += 1
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if obs["board"][tx][ty] == 1:
                    sum_reward += 2
                elif obs["board"][tx][ty] > 10:
                    sum_reward += 4
        else:
            assert (1 <= action <= 4), str(action)
            dx = x + px[action - 1]
            dy = y + py[action - 1]
            if (not (dx < 0 or dx > 10 or dy < 0 or dy > 10)) and obs["board"][dx][dy] == 0:
                if self.visited[dx][dy] > 0:
                    sum_reward -= 0.1
                else:
                    sum_reward += 0.3
                    self.visited[dx][dy] = 1
        sum_reward = sum_reward * 1.0 / 100.0
        new_total_reward = self.total_reward + sum_reward
        if new_total_reward > 0.8 or new_total_reward < -0.5:
            sum_reward = 0.0
        else:
            self.total_reward = new_total_reward
        return sum_reward

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if (self.player_agent_id not in self.alive_agents) or obs["step_count"] >= 500:
            done = True
        obs = self.preproess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order].copy()

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
        Observations for agents:
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size'] or 11
        num_items = env_config['num_items'] or 11
        print("env config: {}".format(env_config))
        # board_size = 11
        board = spaces.Box(low=0, high=len(constants.Item), shape=(board_size, board_size))
        danger = spaces.Box(low=0, high=20, shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=10, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        # return spaces.Dict({"board": board,
        #                     "bomb_blast_strength": bomb_blast_strength, "bomb_life": bomb_life,
        #                     "flame_life": flame_life,
        #                     "position": position, "ammo": ammo, "blast_strength": blast_strength})
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength,
            "danger": danger
        })

    @staticmethod
    def preproess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        # flame_life = obs['flame_life']
        # position = obs['position']
        # ammo = obs['ammo']
        # blast_strength = obs['blast_strength']
        danger = np.ndarray(shape=(11, 11), dtype=int)
        for x in range(11):
            for y in range(11):
                danger[x][y] = 10
                if board[x][y] == 4:
                    board[x][y] = 0
                    danger[x][y] = 0
                elif board[x][y] == 3:
                    board[x][y] = 0
                elif board[x][y] == 10:
                    board[x][y] = 1
                elif board[x][y] > 10:
                    board[x][y] = 5
                elif 6 <= board[x][y] <= 8:
                    board[x][y] = 3
                elif board[x][y] == 1:
                    board[x][y] = 4
        for x in range(11):
            for y in range(11):
                if bomb_life[x][y] > 0:
                    strength = int(bomb_blast_strength[x][y] + 0.5)
                    for tx in range(max(0, x - strength + 1), min(11, x + strength)):
                        danger[tx][y] = min(danger[tx][y], bomb_life[x][y])
                    for ty in range(max(0, y - strength + 1), min(11, y + strength)):
                        danger[x][ty] = min(danger[x][ty], bomb_life[x][y])
        obs['danger'] = danger
        return obs

    def render(self):
        self.pomme.render()
    batch_size=100,
    learning_rate=1e-3,
    summarizer=dict(
        directory="./board5/",
        # steps=50,
        summaries='all'))

# Add 3 SimpleAgents
agents = []
for agent_id in range(3):
    agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
# Add TensorforceAgent
agent_id += 1
agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
env.set_agents(agents)
env.set_training_agent(agents[-1].agent_id)
env.set_init_game_state(None)

# Instantiate and run the environment
wrapped_env = WrappedEnv(env, env.observation_space, env.action_space, True, 600)
runner = Runner(agent=agent, environment=wrapped_env, max_episode_timesteps=600)
runner.run(num_episodes=15000)

# Save agent model
# - format: 'numpy' or 'hdf5' store only weights, 'checkpoint' stores the full TensorFlow model
runner.agent.save(directory="C:\\Users\\ali_k\\Desktop\\my_model", format='checkpoint')
class PomFFA(gym.Env):
    def __init__(self, env_config=None):
        self.agent_list = [HoldAgent(), agents.SimpleAgent(), HoldAgent(), HoldAgent()]
        # self.agent_list = [agents.SimpleAgent(), agents.SimpleAgent(),
        #                    agents.SimpleAgent(), agents.RandomAgent()]
        self.all_obs = None
        self.all_action = None
        self.cur_obs = None
        self.alive_agents = [10, 11, 12, 13]
        self.player_agent_id = 10
        self.total_reward = 0
        pomme_config = pommerman.configs.ffa_competition_env()
        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space
        if not env_config or (env_config and env_config.get("is_training", True)):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            print(id_, pomm_config['game_type'])
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        self.cur_obs = obs.copy()
        obs = self.preproess(obs)
        self.total_reward = 0
        return obs

    def get_reward(self, obs, action, agent_id):
        if len(obs["alive"]) == 1:
            # An agent won. Give them +1, others -1.
            if agent_id in obs['alive']:
                return 0.5
            else:
                return -0.5
        if obs["step_count"] >= 500:
            # Game is over from time. Everyone gets -1.
            return -0.5
        # Game running: 0 for alive, -1 for dead.
        if agent_id not in obs['alive']:
            return -0.5
        x, y = obs["position"]
        # blast = obs["bomb_blast_strength"]
        px = [0, 1, 0, -1]
        py = [1, 0, -1, 0]
        sum_reward = 0
        if action == 5:
            for i in range(4):
                tx = x + px[i]
                ty = y + py[i]
                if tx < 0 or tx > 10 or ty < 0 or ty > 10:
                    continue
                if obs["board"][tx][ty] == 1:
                    sum_reward += 1
                elif obs["board"][tx][ty] > 10:
                    sum_reward += 4
        sum_reward = sum_reward * 1.0 / 200.0
        new_total_reward = self.total_reward + sum_reward
        if new_total_reward > 0.5 or new_total_reward < -0.5:
            sum_reward = 0
        else:
            self.total_reward = new_total_reward
        return sum_reward

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        if self.alive_agents and self.player_agent_id in self.alive_agents:
            actions = self.set_for_training_agent(actions, action)
        else:
            actions = self.set_for_training_agent(actions, 0)
        obs, rewards, done, info = self.pomme.step(actions)
        # print(obs)
        del self.all_obs
        self.all_obs = obs.copy()
        obs = self.get_for_training_agent(obs)
        del self.cur_obs
        self.cur_obs = obs.copy()
        reward = self.get_reward(self.cur_obs, action, self.player_agent_id)
        self.alive_agents = obs['alive']
        if self.player_agent_id not in self.alive_agents or self.cur_obs["step_count"] >= 500:
            done = True
        obs = self.preproess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.player_agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.player_agent_id - 10
        inputs[order] = value
        return inputs

    def init_observation_space(self, env_config):
        """
        Observations for agents:
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        # print("env config: {}".format(env_config))
        # board_size = 11
        board = spaces.Box(low=0, high=len(constants.Item), shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items, shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2,))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1,))
        ammo = spaces.Box(low=0, high=num_items, shape=(1,))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def preproess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
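# A quick way to sanity-check any of the PomFFA wrappers above before handing
# them to a training framework is to drive one episode with random actions.
# Minimal sketch; the class name comes from the snippet above, the rest is illustrative:
env = PomFFA()
obs = env.reset()
done, total_reward, steps = False, 0.0, 0
while not done:
    action = env.action_space.sample()      # random action for the training agent
    obs, reward, done, info = env.step(action)
    total_reward += reward
    steps += 1
print("episode length:", steps, "return:", total_reward)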
# Instantiate the environment
config = ffa_v0_fast_env()
env = Pomme(**config["env_kwargs"])
env.action_space.n

# Add 3 static agents
agents = {}
for agent_id in range(3):
    agents[agent_id] = StaticAgent(config["agent"](agent_id, config["game_type"]))
# Add human agent
agent_id += 1
agents[3] = PlayerAgent(config["agent"](agent_id, config["game_type"]), "arrows")
env.set_agents(list(agents.values()))
env.set_init_game_state(None)

# Seed and reset the environment
env.seed(0)
obs = env.reset()

# Run the agents until we're done
done = False
while not done:
    env.render()
    actions = env.act(obs)
    # not needed:
    # actions = [action % 4 for action in actions]
    # actions = [0, actions[1]]
    obs, reward, done, info = env.step(actions)
def main(args):
    version = 'v1'
    episodes = args.episodes
    visualize = args.visualize

    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    agent = PPOAgent(
        states=dict(type='float', shape=(11, 11, 12)),
        actions=dict(type='int', num_actions=env.action_space.n),
        network=[
            # (9, 9, 12)
            dict(type='conv2d', size=12, window=3, stride=1),
            # (7, 7, 8)
            dict(type='conv2d', size=8, window=3, stride=1),
            # (5, 5, 4)
            dict(type='conv2d', size=4, window=3, stride=1),
            # (100)
            dict(type='flatten'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='relu'),
        ],
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-4))
    if os.path.exists(os.path.join('models', version, 'checkpoint')):
        agent.restore_model(directory=os.path.join('models', version))

    agents = []
    for agent_id in range(3):
        # agents.append(RandomAgent(config["agent"](agent_id, config["game_type"])))
        # agents.append(StoppingAgent(config["agent"](agent_id, config["game_type"])))
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    wrapped_env = WrappedEnv(env, agent, visualize)
    runner = Runner(agent=agent, environment=wrapped_env)
    try:
        runner.run(episodes=episodes, max_episode_timesteps=100)
    except Exception as e:
        raise e
    finally:
        agent.save_model(directory=os.path.join('models', version, 'agent'))

    win_count = len(list(filter(lambda reward: reward == 1, runner.episode_rewards)))
    print('Stats: ')
    print(f' runner.episode_rewards = {runner.episode_rewards}')
    print(f' win count = {win_count}')
    try:
        runner.close()
    except AttributeError as e:
        raise e