Code example #1
def random_encounter(
    agent: Agent,
    foe: Foe,
    max_turns: int,
) -> tuple:
    """
    :param agent:
    :param foe:
    :param max_turns:
    :return:
    """

    reward = Reward(agent, foe)
    utility = reward.get_reward(agent, foe)

    # Arrays to hold encounter_stats
    agent_policies = []
    agent_spell_slots = []
    agent_shields = []
    agent_healths = []
    foe_healths = []
    rewards = []

    for i in range(max_turns):

        agent_policy = agent.random_action()

        agent_action, __ = turn(agent, agent_policy, foe)

        utility = reward.get_reward(agent, foe)

        # Collect turn data into encounter_stats
        agent_policies.append(agent_policy)
        agent_spell_slots.append(agent.states["spell slots"])
        agent_shields.append(agent.states["shield"])
        agent_healths.append(agent.hp)
        foe_healths.append(foe.hp)
        rewards.append(utility)

        if agent.hp <= 0 or foe.hp <= 0:
            # end encounter if either dies
            break

    encounter_stats = pd.DataFrame({
        "agent actions": agent_policies,
        "agent spell slots": agent_spell_slots,
        "agent shield": agent_shields,
        "agent health": agent_healths,
        "foe health": foe_healths,
        "reward": rewards,
    })

    return agent, foe, encounter_stats
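
A minimal usage sketch for the example above, assuming Agent, Foe and random_encounter are importable from the project and that Agent() and Foe() construct with default stats (both are assumptions, not shown in the source):

# Hypothetical driver; default constructors are an assumption.
agent, foe, stats = random_encounter(Agent(), Foe(), max_turns=20)
print(stats[["agent actions", "reward"]].tail())
print("agent hp:", agent.hp, "foe hp:", foe.hp)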
Code example #2
def __lookahead(
        agent: Agent,  # Agent and foe represent the full state
        foe: Foe,
        policy_step: str,
        reward: Reward,
        discount: float) -> float:

    agent_copy = copy.deepcopy(agent)
    foe_copy = copy.deepcopy(foe)

    # Utility of the current state, plus the discounted reward of the state
    # reached by simulating one turn of the candidate policy step against the
    # foe's expected behaviour.
    utility = reward.get_reward(agent_copy, foe_copy)

    turn(agent_copy, policy_step, foe_copy, "expectation")

    return utility + discount * reward.get_reward(agent_copy, foe_copy)
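
For context, a greedy one-step selection built on __lookahead could look like the sketch below; greedy_policy_step and the candidate_steps list are illustrative assumptions, not part of the source project:

def greedy_policy_step(agent, foe, reward, candidate_steps, discount=0.9):
    # Score every candidate policy step with the one-step lookahead above and
    # return the highest-value step together with its estimated value.
    best_step, best_value = None, float("-inf")
    for step in candidate_steps:
        value = __lookahead(agent, foe, step, reward, discount)
        if value > best_value:
            best_step, best_value = step, value
    return best_step, best_value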
Code example #3
def encounter(
    agent: Agent,
    foe: Foe,
    max_turns: int,
    forward_search_and_reward_kwargs={},
) -> tuple:
    """
    TODO: document me!!

    :param agent:
    :param foe:
    :param max_turns:
    :param forward_search_and_reward_kwargs:
        - forward_search_kwargs:
            - depth: int = 3,
        - reward_kwargs:
            - reward_for_kill: float = 1000,
            - penalty_for_dying: float = -1000,
            - agent_hp_bonus: float = 2,
            - foe_hp_bonus: float = -1
    :return:
    """

    # Handle kwargs
    forward_search_kwargs, reward_kwargs = __get_kwargs(
        forward_search_and_reward_kwargs)

    reward = Reward(agent, foe, **reward_kwargs)
    utility = reward.get_reward(agent, foe)

    faux_foe = Foe()  # The belief state of our foe

    # Arrays to hold encounter_stats
    agent_policies = []
    agent_spell_slots = []
    agent_shields = []
    agent_healths = []
    foe_healths = []
    foe_reactions = []
    faux_foe_healths = []
    forward_search_utilities = []
    rewards = []

    for i in range(max_turns):

        agent_policy, forward_search_utility = forward_search(
            agent=copy.deepcopy(agent),
            foe=copy.deepcopy(faux_foe),
            reward=reward,
            utility=utility,
            **forward_search_kwargs)

        agent_action, foe_reaction = turn(agent, agent_policy, foe)

        faux_foe = update_foe_belief(faux_foe, foe_reaction)
        utility += reward.get_reward(agent, foe)

        # Collect turn data into encounter_stats
        agent_policies.append(agent_policy)
        agent_spell_slots.append(agent.states["spell slots"])
        agent_shields.append(agent.states["shield"])
        agent_healths.append(agent.hp)
        foe_healths.append(foe.hp)
        foe_reactions.append(foe_reaction)
        faux_foe_healths.append(faux_foe.hp)
        forward_search_utilities.append(forward_search_utility)
        rewards.append(utility)

        if agent.hp <= 0 or foe.hp <= 0:
            # end encounter if either dies
            break

    encounter_stats = pd.DataFrame({
        "agent actions": agent_policies,
        "agent spell slots": agent_spell_slots,
        "agent shield": agent_shields,
        "agent health": agent_healths,
        "foe health": foe_healths,
        "foe reactions": foe_reactions,
        "faux foe health": faux_foe_healths,
        "forward search utility": forward_search_utilities,
        "utility": rewards,
    })

    return agent, foe, encounter_stats
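
A hypothetical call showing the nested kwargs layout described in the docstring; the exact keys expected by __get_kwargs, and default-constructed Agent() and Foe(), are assumptions:

kwargs = {
    "forward_search_kwargs": {"depth": 3},
    "reward_kwargs": {"reward_for_kill": 1000, "penalty_for_dying": -1000},
}
agent, foe, stats = encounter(Agent(), Foe(), max_turns=20,
                              forward_search_and_reward_kwargs=kwargs)
print(stats[["agent actions", "forward search utility", "utility"]])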
Code example #4
class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    ammo = 1
    blast_strength = 2
    state = {}

    def __init__(self, env_config={}):
        pomme_config = pommerman.configs.ffa_competition_env()
        self.reward = Reward(env_config.get("reward"))

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # Initializing the environment twice can raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)
        self.init_state()

    def init_state(self):
        self.state['agent_id'] = self.agent_id
        self.state['alive'] = self.alive_agents
        self.state['visited'] = set()
        self.state['blast_strength'] = self.blast_strength
        self.state['ammo'] = self.ammo
        self.state["bombs"] = {}

    def reset(self):
        all_obs = self.pomme.reset()
        obs = self.get_for_training_agent(all_obs)
        self.init_state()

        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']

        obs = self.build_obs(obs, self.state)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.state['all_obs'])
        actions = self.set_for_training_agent(actions, action)

        all_obs, _, _, _ = self.pomme.step(actions)
        obs = self.get_for_training_agent(all_obs)
        info = {'board': obs['board'], 'blast_strength': obs['blast_strength']}
        done = self.get_done(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)

        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['all_obs'] = all_obs
        self.state['alive'] = obs['alive']
        self.state['blast_strength'] = obs['blast_strength']
        self.state['ammo'] = obs['ammo']

        obs = self.build_obs(obs, self.state)
        return obs, reward, done, info

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    def build_obs(self, obs, state):
        board = obs['board']
        bomb_blast_strength = obs['bomb_blast_strength']
        bomb_life = obs['bomb_life']
        flame_life = obs['flame_life']
        agent_id = state['agent_id']
        ammo = state['ammo']
        passage = np.zeros_like(board)
        wall = np.zeros_like(board)
        wood = np.zeros_like(board)
        bomb = np.zeros_like(board)
        bonus = np.zeros_like(board)
        me = np.zeros_like(board)
        enemy = np.zeros_like(board)
        for y in range(board.shape[0]):
            for x in range(board.shape[1]):
                v = board[y][x]
                if v == 0:
                    passage[y][x] = 1
                elif v == 1:
                    wall[y][x] = 1
                elif v == 2:
                    wood[y][x] = 1
                elif v == 3:
                    bomb = create_cross(bomb, (y, x),
                                        bomb_blast_strength[y][x])
                elif v == 4:
                    pass
                elif v == 6 or v == 7:
                    bonus[y][x] = 1
                elif v >= 10:
                    if v == agent_id:
                        me[y][x] = 1
                    else:
                        enemy[y][x] = 1
                    if bomb_blast_strength[y][x] > 0:
                        bomb = create_cross(bomb, (y, x),
                                            bomb_blast_strength[y][x])

        ammo = ammo * np.ones_like(board) / 12
        bomb_life /= 9
        flame_life /= 3
        board = np.transpose(
            np.stack([
                passage, wall, wood, bomb, bonus, me, enemy, bomb_life,
                flame_life, ammo
            ]), [1, 2, 0])
        return board

    @staticmethod
    def init_observation_space(env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']

        board = spaces.Box(
            low=0, high=1,
            shape=(board_size, board_size, 10)
        )  # passage, wall, wood, bomb, bonus, me, enemy, bomb_life, flame_life, ammo
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        ammo = spaces.Box(low=0,
                          high=num_items,
                          shape=(board_size, board_size))
        # return spaces.Dict({"board": board, "bomb_life": bomb_life, "flame_life": flame_life,"ammo": ammo})
        return board

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    def render(self):
        self.pomme.render()
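
A minimal random rollout against the environment above (a sketch; it assumes pommerman and its dependencies are installed and uses only methods defined in the class):

env = PomFFA()
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # random action for the training agent
    obs, reward, done, info = env.step(action)
    env.render()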
Code example #5
File: env.py  Project: jiarongqiu/Pommerman-RLlib
class PomFFA(gym.Env):
    agent_list = [
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent(),
        agents.StaticAgent()
    ]
    all_obs = None
    all_action = None
    pre_obs = None
    alive_agents = [10, 11, 12, 13]
    agent_id = 10
    state = {}

    def __init__(self, env_config=None):

        pomme_config = pommerman.configs.ffa_competition_env()

        if env_config:
            for k, v in env_config.items():
                if k in pomme_config['env_kwargs']:
                    pomme_config['env_kwargs'][k] = v
            self.reward = Reward(env_config.get("reward"))
        else:
            self.reward = Reward()

        print("Pommerman Config:", pomme_config['env_kwargs'])

        self.pomme = Pomme(**pomme_config['env_kwargs'])

        self.observation_space = self.init_observation_space(
            pomme_config['env_kwargs'])
        self.action_space = self.pomme.action_space

        if not env_config or env_config.get("is_training", True):
            # Initializing the environment twice can raise an error here.
            self.init(pomme_config)

    def init(self, pomm_config):
        for id_, agent in enumerate(self.agent_list):
            assert isinstance(agent, agents.BaseAgent)
            agent.init_agent(id_, pomm_config['game_type'])
        self.pomme.set_agents(self.agent_list)
        self.pomme.set_init_game_state(None)

    def reset(self):
        obs = self.pomme.reset()
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        state = {
            "prev_obs": None,
            "visited": set(),
            "agent_id": 10,
            "alive": [10, 11, 12, 13],
            "strength": 2,
            "ammo": 1,
            "bombs": {},
        }
        state['prev_obs'] = copy.deepcopy(obs)
        state['position'] = obs['position']
        self.state = state
        obs = self.preprocess(obs)
        return obs

    def step(self, action):
        actions = self.pomme.act(self.all_obs)
        actions = self.set_for_training_agent(actions, action)

        obs, rewards, _, _ = self.pomme.step(actions)
        self.all_obs = obs
        obs = self.get_for_training_agent(obs)
        reward, self.state = self.reward.get_reward(action, obs, self.state)
        done = self.get_done(obs)
        self.state['prev_obs'] = copy.deepcopy(obs)
        self.state['position'] = obs['position']
        obs = self.preprocess(obs)

        return obs, reward, done, {}

    def get_for_training_agent(self, inputs):
        order = self.agent_id - 10
        return inputs[order]

    def set_for_training_agent(self, inputs, value):
        order = self.agent_id - 10
        inputs[order] = value
        return inputs

    def get_done(self, obs):
        if self.agent_id not in obs['alive']:
            return True
        if obs['step_count'] >= 800:
            return True
        return False

    @staticmethod
    def init_observation_space(env_config):
        """
            observations for agents
            board: n^2
            bomb blast strength: n^2
            bomb life: n^2
        """
        board_size = env_config['board_size']
        num_items = env_config['num_items']

        board = spaces.Box(low=0,
                           high=len(constants.Item),
                           shape=(board_size, board_size))
        bomb_blast_strength = spaces.Box(low=0,
                                         high=num_items,
                                         shape=(board_size, board_size))
        bomb_life = spaces.Box(low=0, high=9, shape=(board_size, board_size))
        flame_life = spaces.Box(low=0, high=3, shape=(board_size, board_size))
        position = spaces.Box(low=0, high=board_size, shape=(2, ))
        blast_strength = spaces.Box(low=1, high=num_items, shape=(1, ))
        ammo = spaces.Box(low=0, high=num_items, shape=(1, ))
        return spaces.Dict({
            "board": board,
            "bomb_blast_strength": bomb_blast_strength,
            "bomb_life": bomb_life,
            "flame_life": flame_life,
            "position": position,
            "ammo": ammo,
            "blast_strength": blast_strength
        })

    @staticmethod
    def init_action_space():
        return spaces.Discrete(6)

    @staticmethod
    def preprocess(obs):
        del obs["game_type"]
        del obs["game_env"]
        del obs["can_kick"]
        del obs["teammate"]
        del obs["enemies"]
        del obs["step_count"]
        del obs['alive']
        del obs['bomb_moving_direction']

        obs['position'] = np.array(obs['position'])
        obs['ammo'] = np.array([obs['ammo']])
        obs['blast_strength'] = np.array([obs['blast_strength']])

        return obs

    def render(self):
        self.pomme.render()
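
Given the project name, this environment is presumably consumed through RLlib; a registration sketch might look like the following (the env name string is hypothetical):

from ray.tune.registry import register_env

# RLlib passes its env_config dict to the creator, which PomFFA forwards to
# its reward setup and Pomme configuration.
register_env("pommerman_ffa_v0", lambda env_config: PomFFA(env_config))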
Code example #6
def main():

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu:0')

    configuration = Configuration('project.config')
    config = configuration.get_config_options()

    # create_vocabulary_from_dataset(config)
    # make_target_vocab(config)

    word2idx, dataset_vectors = load_target_vocab(config)

    use_safe_dataset = True
    dataset = TextDataset(word2idx, dataset_vectors, config=config)
    if use_safe_dataset:
        dataset = nc.SafeDataset(dataset)
        data_loader = nc.SafeDataLoader(dataset=dataset,
                                        batch_size=config.globals.BATCH_SIZE,
                                        num_workers=0,
                                        shuffle=True)
    else:
        data_loader = datautil.DataLoader(dataset=dataset,
                                          batch_size=config.globals.BATCH_SIZE,
                                          num_workers=0,
                                          shuffle=False)

    # model = SentenceEncoder(target_vocab = word2idx.keys(), vectors = dataset_vectors, config = config)
    # doc_enc = DocumentEncoder(config=config)

    policy_net = DQN(target_vocab=word2idx.keys(),
                     vectors=dataset_vectors,
                     config=config).to(device)
    target_net = DQN(target_vocab=word2idx.keys(),
                     vectors=dataset_vectors,
                     config=config).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    reward_func = Reward()
    h = reward_func.get_reward([['hello']], [[['this is a good hello']]])
    print(h)

    def select_action(config, doc, state):
        # TODO: fix the function to handle the full batchsize
        # TODO: send all tensors to GPU

        sample = np.random.random()

        # article = '\n'.join(doc['raw'])
        # article = article.split('\n\n')
        doc_tensor = doc['tensor'][:, :len(doc['raw']) - 1]
        # Putting this here as we need q_values one way or the other
        q_values = policy_net(doc_tensor,
                              get_q_approx=True,
                              sum_i=state['sum_i'])

        # Decay the epsilon per EPS_DECAY_ITER iterations
        if iter % config.dqn.EPS_DECAY_ITER == 0:
            config.dqn.EPS_START -= config.dqn.EPS_DECAY
            print('EPSILON Decayed to : ', config.dqn.EPS_START)

        if sample < config.dqn.EPS_START:
            i = np.random.randint(low=0, high=len(doc['raw']) - 1)
        else:
            # actions are sentences
            i = torch.argmax(q_values, dim=1)

        a_i = (i, doc['raw'][i])
        return a_i, q_values

    optimizer = torch.optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(config.dqn.REPLAY_MEM_SIZE)

    epoch = 0
    iter = 0

    for epoch in tqdm(range(epoch, config.globals.NUM_EPOCHS)):
        policy_net.train()
        for i, (story, highlights) in tqdm(enumerate(data_loader)):
            state = {
                'curr_summary_ids': [],
                'curr_summary': [],
                'sum_i': torch.zeros((100))
            }
            next_state = state
            prev_r_i, r_i = 0, 0
            # locking to 10 for simplicity purposes
            for i in count(config.dqn.SUMMARY_LENGTH):
                iter = iter + 1

                # if i>20 : break

                story['tensor'] = story['tensor'].to(device)
                highlights['tensor'] = highlights['tensor'].to(device)
                # Sentence representations are computed under no_grad because we
                # don't want to update the encoder weights here.
                with torch.no_grad():
                    H_i, D_i, x = policy_net(
                        story['tensor'][:, :len(story['raw']) - 1])

                a_i, q_values = select_action(config, story, state)

                next_state['curr_summary_ids'].append(int(a_i[0]))
                next_state['curr_summary'].append(a_i[1])
                next_state['sum_i'] = Sum_i(H_i, state['curr_summary_ids'],
                                            q_values)
                r_i = reward_func.get_reward([next_state['curr_summary']],
                                             gold_summ=[[highlights['raw']]],
                                             **{
                                                 'prev_score': prev_r_i,
                                                 'config': config
                                             })
                prev_r_i = r_i
                # checks if we are close to the summ length part
                done = check_done(config, next_state)
                if done:
                    next_state = None
                # TODO: check which a_i has to be loaded , a_i[0] or a_i[1] or just a_i
                memory.push(state, H_i[a_i[0]], next_state, r_i)
                state = next_state
                optimize_model(config)

                if done:
                    break