import copy
from typing import Tuple

import pandas as pd

# Agent, Foe, Reward, turn, forward_search, update_foe_belief, and
# __get_kwargs are assumed to be defined elsewhere in this project.


def random_encounter(
        agent: Agent,
        foe: Foe,
        max_turns: int,
) -> Tuple[Agent, Foe, pd.DataFrame]:
    """Run an encounter in which the agent takes uniformly random actions.

    :param agent: the Agent whose actions are sampled at random
    :param foe: the Foe the agent fights
    :param max_turns: maximum number of turns before the encounter is cut off
    :return: the (mutated) agent, the (mutated) foe, and a DataFrame of
        per-turn encounter statistics
    """
    reward = Reward(agent, foe)

    # Arrays to hold encounter_stats
    agent_policies = []
    agent_spell_slots = []
    agent_shields = []
    agent_healths = []
    foe_healths = []
    rewards = []

    for _ in range(max_turns):
        agent_policy = agent.random_action()
        agent_action, __ = turn(agent, agent_policy, foe)
        utility = reward.get_reward(agent, foe)

        # Collect turn data into encounter_stats
        agent_policies.append(agent_policy)
        agent_spell_slots.append(agent.states["spell slots"])
        agent_shields.append(agent.states["shield"])
        agent_healths.append(agent.hp)
        foe_healths.append(foe.hp)
        rewards.append(utility)

        if agent.hp <= 0 or foe.hp <= 0:  # end encounter if either dies
            break

    encounter_stats = pd.DataFrame({
        "agent actions": agent_policies,
        "agent spell slots": agent_spell_slots,
        "agent shield": agent_shields,
        "agent health": agent_healths,
        "foe health": foe_healths,
        "reward": rewards,
    })

    return agent, foe, encounter_stats
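# Hypothetical usage: run a random-policy baseline encounter. Agent() and
# Foe() are assumed here to construct default combatants.
agent, foe, stats = random_encounter(Agent(), Foe(), max_turns=100)
print(stats.tail())  # last few turns: actions, resources, HP, and reward
print("agent won" if foe.hp <= 0 else "agent died or the encounter timed out")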
def __lookahead(
        agent: Agent,  # Agent and foe together represent the full state
        foe: Foe,
        policy_step: str,
        reward: Reward,
        discount: float) -> float:
    """Score taking `policy_step` now: the current reward plus the
    discounted reward of the expected state one turn later."""
    # Work on copies so the lookahead never mutates the real encounter state.
    agent_copy = copy.deepcopy(agent)
    foe_copy = copy.deepcopy(foe)

    utility = reward.get_reward(agent_copy, foe_copy)
    # Simulate one turn in expectation, then score the resulting state.
    turn(agent_copy, policy_step, foe_copy, "expectation")
    return utility + discount * reward.get_reward(agent_copy, foe_copy)
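# The project's forward_search (used by encounter below) is not shown in this
# excerpt. As a rough, hypothetical illustration of how __lookahead is meant
# to be driven, the helper below greedily scores each candidate action one
# step ahead. `candidate_actions` is an assumed parameter; the real search is
# depth-limited rather than one-step.
def __greedy_one_step(agent: Agent,
                      foe: Foe,
                      candidate_actions: list,
                      reward: Reward,
                      discount: float = 0.9) -> tuple:
    best_action, best_value = None, float("-inf")
    for action in candidate_actions:
        value = __lookahead(agent, foe, action, reward, discount)
        if value > best_value:
            best_action, best_value = action, value
    return best_action, best_value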
def encounter(
        agent: Agent,
        foe: Foe,
        max_turns: int,
        forward_search_and_reward_kwargs: dict = None,
) -> Tuple[Agent, Foe, pd.DataFrame]:
    """Run an encounter in which the agent picks each action by forward
    search against its belief state of the foe.

    :param agent: the Agent controlled by forward search
    :param foe: the true Foe the agent fights
    :param max_turns: maximum number of turns before the encounter is cut off
    :param forward_search_and_reward_kwargs: optional nested kwargs:
        - forward_search_kwargs:
            - depth: int = 3
        - reward_kwargs:
            - reward_for_kill: float = 1000
            - penalty_for_dying: float = -1000
            - agent_hp_bonus: float = 2
            - foe_hp_bonus: float = -1
    :return: the (mutated) agent, the (mutated) foe, and a DataFrame of
        per-turn encounter statistics
    """
    # Handle kwargs
    if forward_search_and_reward_kwargs is None:
        forward_search_and_reward_kwargs = {}
    forward_search_kwargs, reward_kwargs = __get_kwargs(
        forward_search_and_reward_kwargs)

    reward = Reward(agent, foe, **reward_kwargs)
    utility = reward.get_reward(agent, foe)
    faux_foe = Foe()  # The belief state of our foe

    # Arrays to hold encounter_stats
    agent_policies = []
    agent_spell_slots = []
    agent_shields = []
    agent_healths = []
    foe_healths = []
    foe_reactions = []
    faux_foe_healths = []
    forward_search_utilities = []
    rewards = []

    for _ in range(max_turns):
        # Plan on copies so the search cannot mutate the real state, and
        # plan against the belief-state foe rather than the true foe.
        agent_policy, forward_search_utility = forward_search(
            agent=copy.deepcopy(agent),
            foe=copy.deepcopy(faux_foe),
            reward=reward,
            utility=utility,
            **forward_search_kwargs)

        agent_action, foe_reaction = turn(agent, agent_policy, foe)
        faux_foe = update_foe_belief(faux_foe, foe_reaction)
        utility += reward.get_reward(agent, foe)

        # Collect turn data into encounter_stats
        agent_policies.append(agent_policy)
        agent_spell_slots.append(agent.states["spell slots"])
        agent_shields.append(agent.states["shield"])
        agent_healths.append(agent.hp)
        foe_healths.append(foe.hp)
        foe_reactions.append(foe_reaction)
        faux_foe_healths.append(faux_foe.hp)
        forward_search_utilities.append(forward_search_utility)
        rewards.append(utility)

        if agent.hp <= 0 or foe.hp <= 0:  # end encounter if either dies
            break

    encounter_stats = pd.DataFrame({
        "agent actions": agent_policies,
        "agent spell slots": agent_spell_slots,
        "agent shield": agent_shields,
        "agent health": agent_healths,
        "foe health": foe_healths,
        "foe reactions": foe_reactions,
        "faux foe health": faux_foe_healths,
        "forward search utility": forward_search_utilities,
        "utility": rewards,
    })

    return agent, foe, encounter_stats
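# Hypothetical usage of encounter(), assuming Agent() and Foe() construct
# default combatants; the nested kwargs mirror the structure documented in
# the docstring above.
agent, foe, stats = encounter(
    agent=Agent(),
    foe=Foe(),
    max_turns=50,
    forward_search_and_reward_kwargs={
        "forward_search_kwargs": {"depth": 3},
        "reward_kwargs": {"reward_for_kill": 1000,
                          "penalty_for_dying": -1000},
    },
)
print(stats[["agent actions", "utility"]].tail())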
import copy

import gym
import numpy as np
import pommerman
from gym import spaces
from pommerman import agents
from pommerman.envs.v0 import Pomme

# Reward and create_cross are project-local helpers; StaticAgent is assumed
# to be a project-local BaseAgent subclass that stands still.


class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    ammo = 1
    blast_strength = 2
    state = {}

    def __init__(self, env_config=None):
        if env_config is None:
            env_config = {}
        pomme_config = pommerman.configs.ffa_competition_env()

        self.reward = Reward(env_config.get("reward"))
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)
        self.init_state()

    def init_state(self):
        self.state['agent_id'] = self.agent_id
        self.state['alive'] = self.alive_agents
        self.state['visited'] = set()
        self.state['blast_strength'] = self.blast_strength
        self.state['ammo'] = self.ammo
        self.state["bombs"] = {}

    def reset(self):
        all_obs = self.pomme.reset()
        obs = self.get_for_training_agent(all_obs)
        self.init_state()
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        obs = self.build_obs(obs, self.state)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.state['all_obs'])
        actions = self.set_for_training_agent(actions, action)
        all_obs, _, _, _ = self.pomme.step(actions)
        obs = self.get_for_training_agent(all_obs)
        info = {'board': obs['board'], 'blast_strength': obs['blast_strength']}
        done = self.get_done(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        self.state['blast_strength'] = obs['blast_strength']
        self.state['ammo'] = obs['ammo']
        obs = self.build_obs(obs, self.state)
        return obs, reward, done, info

    def get_for_training_agent(self, inputs):
        # Agent ids start at 10; the list index is id - 10.
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    def build_obs(self, obs, state):
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        flame_life = obs['flame_life']
        agent_id = state['agent_id']
        ammo = state['ammo']

        # One binary plane per board feature.
        passage = np.zeros_like(board)
        wall = np.zeros_like(board)
        wood = np.zeros_like(board)
        bomb = np.zeros_like(board)
        bonus = np.zeros_like(board)
        me = np.zeros_like(board)
        enemy = np.zeros_like(board)

        for y in range(board.shape[0]):
            for x in range(board.shape[1]):
                v = board[y][x]
                if v == 0:
                    passage[y][x] = 1
                elif v == 1:
                    wall[y][x] = 1
                elif v == 2:
                    wood[y][x] = 1
                elif v == 3:
                    # Mark the bomb's whole blast cross, not just its cell.
                    bomb = create_cross(bomb, (y, x),
                                        bomb_blast_strength[y][x])
                elif v == 4:
                    # Flames are already encoded via the flame_life plane.
                    pass
                elif v == 6 or v == 7:
                    # Power-ups: extra bomb, increased range.
                    bonus[y][x] = 1
                elif v >= 10:
                    if v == agent_id:
                        me[y][x] = 1
                    else:
                        enemy[y][x] = 1
                    if bomb_blast_strength[y][x] > 0:
                        # An agent standing on a bomb hides it on the board.
                        bomb = create_cross(bomb, (y, x),
                                            bomb_blast_strength[y][x])

        # Normalize the scalar planes to roughly [0, 1].
        ammo = ammo * np.ones_like(board) / 12
        bomb_life /= 9
        flame_life /= 3

        board = np.transpose(
            np.stack([
                passage, wall, wood, bomb, bonus, me, enemy, bomb_life,
                flame_life, ammo
            ]), [1, 2, 0])
        return board

    @staticmethod
    def init_observation_space(env_config):
        """Observation space for the training agent: a single Box of ten
        stacked feature planes (passage, wall, wood, bomb, bonus, me, enemy,
        bomb_life, flame_life, ammo), each board_size x board_size.
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        board = spaces.Box(low=0, high=1, shape=(board_size, board_size, 10))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        ammo = spaces.Box(low=0, high=num_items,
                          shape=(board_size, board_size))
        # return spaces.Dict({"board": board, "bomb_life": bomb_life,
        #                     "flame_life": flame_life, "ammo": ammo})
        return board

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    def render(self):
        self.pomme.render()
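# A minimal smoke test for this variant (assumes pommerman is installed and
# StaticAgent is available): roll out one episode under a random policy.
env = PomFFA()
obs = env.reset()
print(obs.shape)  # (board_size, board_size, 10) stacked feature planes

done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())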
# Same imports as the variant above, plus: from pommerman import constants


class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    all_obs = None
    all_action = None
    pre_obs = None
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    state = {}

    def __init__(self, env_config=None):
        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            # Allow callers to override any of Pommerman's env_kwargs.
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
            self.reward = Reward(env_config.get("reward"))
        else:
            self.reward = Reward()

        print("Pommerman Config:", pomme_config['env_kwargs'])
        self.pomme = Pomme(**pomme_config['env_kwargs'])
        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # Initializing the env twice could raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        state = {
            "prev_obs": None,
            "visited": set(),
            "agent_id": 10,
            "alive": [10, 11, 12, 13],
            "strength": 2,
            "ammo": 1,
            "bombs": {},
        }
        state['prev_obs'] = copy.deepcopy(obs)
        state['position'] = obs['position']
        self.state = state
        obs = self.preprocess(obs)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        actions = self.set_for_training_agent(actions, action)
        obs, rewards, _, _ = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        done = self.get_done(obs)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['position'] = obs['position']
        obs = self.preprocess(obs)
        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        # Agent ids start at 10; the list index is id - 10.
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    @staticmethod
    def init_observation_space(env_config):
        """Observations for the training agent.

        board: n^2
        bomb blast strength: n^2
        bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']
        board = spaces.Box(low=0, high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0, high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    @staticmethod
    def preprocess(obs):
        # Drop the keys the observation space does not expose.
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']
        # Cast the remaining scalars to arrays so they match the Box spaces.
        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])
        return obs

    def render(self):
        self.pomme.render()
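# Hypothetical consistency check for this Dict-observation variant: after
# preprocess(), the observation should carry exactly the keys declared in
# init_observation_space.
env = PomFFA()
obs = env.reset()
assert set(obs.keys()) == set(env.observation_space.spaces.keys())
obs, reward, done, _ = env.step(env.action_space.sample())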
import copy
from itertools import count

import numpy as np
import torch
from tqdm import tqdm

# Assumed project-local imports: Configuration, TextDataset, DQN, Reward,
# ReplayMemory, Sum_i, check_done, optimize_model, load_target_vocab.
# `nc` is assumed to be the nonechucks library and `datautil` to be
# torch.utils.data.


def main():
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu:0')
    configuration = Configuration('project.config')
    config = configuration.get_config_options()

    # create_vocabulary_from_dataset(config)
    # make_target_vocab(config)
    word2idx, dataset_vectors = load_target_vocab(config)

    use_safe_dataset = True
    dataset = TextDataset(word2idx, dataset_vectors, config=config)
    if use_safe_dataset:
        # SafeDataset/SafeDataLoader skip samples that raise errors.
        dataset = nc.SafeDataset(dataset)
        data_loader = nc.SafeDataLoader(dataset=dataset,
                                        batch_size=config.globals.BATCH_SIZE,
                                        num_workers=0,
                                        shuffle=True)
    else:
        data_loader = datautil.DataLoader(dataset=dataset,
                                          batch_size=config.globals.BATCH_SIZE,
                                          num_workers=0,
                                          shuffle=False)

    # model = SentenceEncoder(target_vocab=word2idx.keys(),
    #                         vectors=dataset_vectors, config=config)
    # doc_enc = DocumentEncoder(config=config)
    policy_net = DQN(target_vocab=word2idx.keys(),
                     vectors=dataset_vectors,
                     config=config).to(device)
    target_net = DQN(target_vocab=word2idx.keys(),
                     vectors=dataset_vectors,
                     config=config).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    reward_func = Reward()
    # Quick sanity check of the reward function.
    h = reward_func.get_reward([['hello']], [[['this is a good hello']]])
    print(h)

    def select_action(config, doc, state):
        # TODO: fix the function to handle the full batch size
        # TODO: send all tensors to GPU
        sample = np.random.random()
        # article = '\n'.join(doc['raw'])
        # article = article.split('\n\n')
        doc_tensor = doc['tensor'][:, :len(doc['raw']) - 1]
        # We need the q_values either way: for the greedy branch here and
        # for Sum_i in the training loop.
        q_values = policy_net(doc_tensor,
                              get_q_approx=True,
                              sum_i=state['sum_i'])
        # Decay epsilon every EPS_DECAY_ITER iterations.
        if step_count % config.dqn.EPS_DECAY_ITER == 0:
            config.dqn.EPS_START -= config.dqn.EPS_DECAY
            print('Epsilon decayed to:', config.dqn.EPS_START)
        if sample < config.dqn.EPS_START:
            # Explore: pick a random sentence index.
            i = np.random.randint(low=0, high=len(doc['raw']) - 1)
        else:
            # Exploit: actions are sentences, so pick the argmax-Q sentence.
            i = torch.argmax(q_values, dim=1)
        a_i = (i, doc['raw'][i])
        return a_i, q_values

    optimizer = torch.optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(config.dqn.REPLAY_MEM_SIZE)

    epoch = 0
    step_count = 0
    for epoch in tqdm(range(epoch, config.globals.NUM_EPOCHS)):
        policy_net.train()
        for batch_idx, (story, highlights) in tqdm(enumerate(data_loader)):
            state = {
                'curr_summary_ids': [],
                'curr_summary': [],
                'sum_i': torch.zeros(100)
            }
            prev_r_i = r_i = 0
            # Runs until check_done() says the summary is long enough.
            for t in count(config.dqn.SUMMARY_LENGTH):
                step_count += 1
                story['tensor'] = story['tensor'].to(device)
                highlights['tensor'] = highlights['tensor'].to(device)
                # Sentence representations are computed under no_grad because
                # we don't want to update the encoder weights here.
                with torch.no_grad():
                    H_i, D_i, x = policy_net(
                        story['tensor'][:, :len(story['raw']) - 1])
                a_i, q_values = select_action(config, story, state)
                # Copy, don't alias: memory.push must see distinct pre- and
                # post-action states.
                next_state = copy.deepcopy(state)
                next_state['curr_summary_ids'].append(int(a_i[0]))
                next_state['curr_summary'].append(a_i[1])
                next_state['sum_i'] = Sum_i(H_i, state['curr_summary_ids'],
                                            q_values)
                r_i = reward_func.get_reward([next_state['curr_summary']],
                                             gold_summ=[[highlights['raw']]],
                                             prev_score=prev_r_i,
                                             config=config)
                prev_r_i = r_i
                # Checks whether we are close to the target summary length.
                done = check_done(config, next_state)
                if done:
                    next_state = None
                # TODO: check which a_i has to be stored: a_i[0], a_i[1],
                # or just a_i
                memory.push(state, H_i[a_i[0]], next_state, r_i)
                state = next_state
                optimize_model(config)
                if done:
                    break
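# ReplayMemory, check_done, and optimize_model are referenced above but not
# shown in this excerpt. Below is a minimal sketch of a ring-buffer replay
# memory compatible with the 4-argument memory.push(...) call above, in the
# style of the PyTorch DQN tutorial; the project's real classes may differ.
import random
from collections import namedtuple

Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Save a transition, overwriting the oldest once at capacity."""
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

# optimize_model(config) would then sample a batch of these transitions and
# take a gradient step on the Bellman error, roughly
#     loss = (r + gamma * max_a' Q_target(s', a') - Q_policy(s, a)) ** 2,
# periodically syncing target_net's weights from policy_net.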