Code Example #1
def agent(obs, config, prev_actions=None):
    if prev_actions is None:
        try:
            prev_actions = agent.prev_actions
        except AttributeError:
            agent.prev_actions = [None] * args.numAgents
            prev_actions = agent.prev_actions

    player = obs.index

    board = get_board(obs, prev_actions, args)
    board = get_player_board(board, player, args.numAgents)

    pi, _ = agent.net.predict(board)

    # mask out the invalid action (the move that reverses the previous one)
    prev_action = prev_actions[player]
    if prev_action is not None:
        prev_action = str_to_action(prev_action)
        oppo_action = Action.opposite(prev_action).value
        pi[oppo_action - 1] = 0
        pi /= pi.sum()

    action = np.argmax(pi) + 1
    action = Action(action).name
    prev_actions[player] = action
    return action
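The agent above follows the standard Kaggle (obs, config) signature. A minimal usage sketch, assuming `args` and `agent.net` have already been set up by the surrounding module, is to pit it against the built-in "greedy" agent:

from kaggle_environments import make

env = make("hungry_geese", debug=True)
# `agent` is the callable defined above; the remaining slots use built-in agents.
env.run([agent, "greedy", "greedy", "greedy"])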
Code Example #2
    def agent_strategy(self, observation, configuration):
        state = self.state_from_world()

        # Process the reward for growing
        reward = len(
            self.my_body) + 2 * self.step  # Geese really like pizza!!! 8-)
        self.previous_length = len(self.my_body)
        self.process_reward(reward)

        # Choose an action
        action = self.epsilon_greedy_choose_action(state)

        # Apply some common sense, e.g. colliding is bad... ;-)
        cs_reward = self.common_sense_after_move_choosen(action)
        if cs_reward < 0:
            # Update the Q-table
            self.process_reward(reward,
                                previous_state=state,
                                last_action=action,
                                last_action_index=self.actions.index(action))

            # Choose a new greedy, risk-averse valid action
            random_action = self.strategy_greedy_avoid_risk(
                observation, configuration)
            # Update the internal action attributes
            aux = [(action, index) for index, action in enumerate(Action)
                   if action.name == random_action][0]
            self.last_action = aux[0]
            self.last_action_index = aux[1]
            action = self.last_action
        print(f'q-agent q_table: {self.q_table}', flush=True)

        self.previous_action = action
        return Action(action).name
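The snippet relies on the class's own epsilon_greedy_choose_action. As a rough illustration only (an assumption, not the class's actual implementation), a generic epsilon-greedy rule looks like this:

import random

def epsilon_greedy(q_table, state, actions, epsilon=0.1):
    # Explore with probability epsilon, or when the state is unseen;
    # otherwise exploit the best-known Q-value for this state.
    if state not in q_table or random.random() < epsilon:
        return random.choice(actions)
    return max(actions, key=lambda a: q_table[state].get(a, 0.0))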
Code Example #3
def select_action(pi, prev_action):
    '''
    Args:
        pi: np.array(4)
        prev_action: action name str such as 'NORTH', or None
    Return:
        action: str
    '''
    if prev_action is not None:
        prev_action = str_to_action(prev_action)
        invalid_action = Action.opposite(prev_action)
        pi[invalid_action.value - 1] = 0
        pi /= np.sum(pi)
    action_num = np.random.choice(len(pi), p=pi) + 1
    action = Action(action_num).name
    return action
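A minimal usage sketch: because the reverse move is zeroed out before sampling, "NORTH" can never follow a previous "SOUTH". It assumes the project's str_to_action helper (mapping "NORTH" to Action.NORTH, etc.) is available alongside select_action.

import numpy as np

pi = np.array([0.4, 0.3, 0.2, 0.1])
# Pass a copy: select_action modifies pi in place when it masks the reverse move.
action = select_action(pi.copy(), "SOUTH")
assert action != "NORTH"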
Code Example #4
    def _translate(self, position: int, direction: Action) -> int:
        rows, columns = self._rows, self._columns
        row, column = row_col(position, columns)
        row_offset, column_offset = direction.to_row_col()
        row = (row + row_offset) % rows
        column = (column + column_offset) % columns
        return row * columns + column
Code Example #5
File: geese_env.py Project: yonsweng/hungry-geese
    def step(self, action):
        # Shift the external action index by action_offset so it matches the Action enum values.
        action += self.action_offset
        self.act_prev[0] = action
        obs, reward, done, info = self.trainer.step(Action(action).name)
        # Shift the observation history: keep the last raw observation around.
        self.obs_prev = self.obs_backup
        self.obs_backup = obs
        reward = self.process_reward(obs, done)
        obs = self.process_obs(obs)
        return obs, reward, done, info
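The action_offset shift above appears to map gym-style 0-based indices onto the 1-based Action enum. A quick check of that mapping (not part of the project code):

from kaggle_environments.envs.hungry_geese.hungry_geese import Action

# Action values are NORTH=1, EAST=2, SOUTH=3, WEST=4, so index + 1 recovers the name.
assert [Action(i + 1).name for i in range(4)] == ["NORTH", "EAST", "SOUTH", "WEST"]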
Code Example #6
def get_valid_moves(state, player, action_size):
    moves = np.ones(action_size)

    # On the first step there is no previous action, so every move is valid.
    if state[0].observation.step == 0:
        return moves

    prev_action = str_to_action(state[player]['action'])
    invalid_action = Action.opposite(prev_action)
    moves[invalid_action.value - 1] = 0
    return moves
Code Example #7
File: geese_env.py Project: yonsweng/hungry-geese
    def opponent(self, obs, conf):
        obs_index = obs.index
        obs = self.process_obs(obs)
        action, _ = self.past_models[obs_index - 1].predict(obs)
        action += self.action_offset
        # act_oppo is the value of the move that would reverse this goose's
        # previous action, i.e. the forbidden move; 0 if there is no history.
        act_oppo = (self.act_prev[obs_index] + 1) % 4 + 1 \
            if self.act_prev[obs_index] is not None else 0
        if action == act_oppo:
            # The model picked the reverse move; choose a random other action instead.
            actions = [a for a in range(1, 5) if a != act_oppo]
            action = actions[random.randrange(len(actions))]
        self.act_prev[obs_index] = action
        return Action(action).name
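A small sanity check (not from the project) that the (a + 1) % 4 + 1 trick used for act_oppo really yields the opposite direction for every Action value:

from kaggle_environments.envs.hungry_geese.hungry_geese import Action

for a in Action:
    # NORTH=1 <-> SOUTH=3, EAST=2 <-> WEST=4
    assert (a.value + 1) % 4 + 1 == Action.opposite(a).value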
Code Example #8
def get_board(obs, prev_actions, args):
    '''
    Channels of the state:
        geese curr_heads  0,  1,  2,  3
        geese tips        4,  5,  6,  7
        geese bodies      8,  9, 10, 11
        geese prev_heads 12, 13, 14, 15
        food             16
    '''
    board = np.zeros(
        (args.numAgents * 4 + 1, args.boardSize[0] * args.boardSize[1]),
        np.uint8)

    for i, goose in enumerate(obs.geese):
        # head position
        for head_pos in goose[:1]:
            board[0 + (i - obs.index) % args.numAgents, head_pos] = 1

        # tip position
        for tip_pos in goose[-1:]:
            board[args.numAgents + (i - obs.index) % args.numAgents,
                  tip_pos] = 1

        # body positions (every segment except the head)
        for body_pos in goose[1:]:
            board[args.numAgents * 2 + (i - obs.index) % args.numAgents,
                  body_pos] = 1

        # previous head position
        for head_pos in goose[:1]:
            if prev_actions[i] is not None:
                prev_action = str_to_action(prev_actions[i])
                opposite_action = Action.opposite(prev_action)
                prev_head_pos = adjacent_positions(
                    head_pos, args.boardSize[1],
                    args.boardSize[0])[opposite_action.value - 1]
                board[args.numAgents * 3 + (i - obs.index) % args.numAgents,
                      prev_head_pos] = 1

    for food_pos in obs.food:
        board[-1, food_pos] = 1

    return board.reshape(-1, args.boardSize[0], args.boardSize[1])
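A minimal usage sketch for get_board with a hand-made observation. The args fields and the dummy observation below are illustrative assumptions; str_to_action and adjacent_positions are assumed to be available in the same module, as above.

from types import SimpleNamespace

args = SimpleNamespace(numAgents=4, boardSize=(7, 11))
obs = SimpleNamespace(index=0, step=1,
                      geese=[[0, 1], [20, 21], [40], []],
                      food=[60, 70])
prev_actions = ["EAST", "SOUTH", None, None]

board = get_board(obs, prev_actions, args)
print(board.shape)  # (17, 7, 11): four channels per goose plus one food channel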
Code Example #9
    def get_board(self, obs, player):
        '''
        Channels of the state:
            geese curr_heads  0,  1,  2,  3
            geese tips        4,  5,  6,  7
            geese bodies      8,  9, 10, 11
            geese prev_heads 12, 13, 14, 15
            food             16
        '''
        board = np.zeros((self.num_agents * 4 + 1, self.config.rows * self.config.columns), np.uint8)

        for i, goose in enumerate(obs.geese):
            # head position
            for head_pos in goose[:1]:
                board[0 + (i - player) % self.num_agents, head_pos] = 1

            # tip position
            for tip_pos in goose[-1:]:
                board[4 + (i - player) % self.num_agents, tip_pos] = 1

            # body positions (every segment except the head)
            for body_pos in goose[1:]:
                board[8 + (i - player) % self.num_agents, body_pos] = 1

            # previous head position
            for head_pos in goose[:1]:
                if obs.step > 0:
                    opposite_action = Action.opposite(self.str_to_action[self.prev_actions[i]])
                    prev_head_pos = adjacent_positions(head_pos, self.config.columns, self.config.rows)[opposite_action.value - 1]

                    board[12 + (i - player) % self.num_agents, prev_head_pos] = 1

        for food_pos in obs.food:
            board[-1, food_pos] = 1

        return board.reshape(-1, self.config.rows, self.config.columns)
Code Example #10
    def getValidMoves(self, player):
        prev_action = self.str_to_action[self.prev_actions[player]]
        oppo_action = Action.opposite(prev_action).value
        valids = [1] * len(Action)
        valids[oppo_action - 1] = 0
        return np.array(valids)
Code Example #11
    def search(self, env, prev_actions, remaining, rank):
        """
        This function performs one iteration of MCTS. It is called recursively
        until a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.

        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
        updated.

        NOTE: the return values are the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.

        Returns:
            v: the negative of the value of the current canonicalBoard
        """
        state = env.state
        obs = state[0].observation
        board = get_board(obs, prev_actions, self.args)
        s = obs.step.to_bytes(2, 'big') + board.tobytes()  # hashable key encoding (step, board)

        if s not in self.Es:
            self.Es[s] = get_reward(obs, 0, self.args.numAgents)
        if self.Es[s] is not None:
            # terminal node
            return self.Es[s]

        # predict players' action
        actions = []
        pis, vs = self.nnet.predicts(board, rank % self.args.n_gpus)
        for i, pi in enumerate(pis):
            action = select_action(pi, prev_actions[i])
            actions.append(action)
        v = vs[0]

        if s not in self.Ps:
            # leaf node
            self.Ps[s] = pis[0]
            valids = get_valid_moves(state, 0, self.args.actionSize)
            self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # renormalize
            else:
                # If all valid moves were masked, make all valid moves equally probable.

                # NB! All valid moves may be masked if your NNet architecture is insufficient, you have overfitting, or something else is wrong.
                # If you get dozens or hundreds of these messages, you should pay attention to your NNet and/or training process.
                log.error("All valid moves were masked, doing a workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            self.Vs[s] = valids
            self.Ns[s] = 0
            # return v

        if remaining == 0:
            return v

        valids = self.Vs[s]
        cur_best = -float('inf')
        best_act = -1

        # pick the action with the highest upper confidence bound
        for a in range(self.args.actionSize):
            if valids[a]:
                if (s, a) in self.Qsa:
                    u = self.Qsa[
                        (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
                            self.Ns[s]) / (1 + self.Nsa[(s, a)])
                else:
                    u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
                        self.Ns[s] + EPS)  # Q = 0 ?

                if u > cur_best:
                    cur_best = u
                    best_act = a
        a = best_act
        actions[0] = Action(a + 1).name

        env.step(actions)

        v = self.search(env, actions, remaining - 1, rank)

        if (s, a) in self.Qsa:
            self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[(s, a)] +
                                v) / (self.Nsa[(s, a)] + 1)
            self.Nsa[(s, a)] += 1

        else:
            self.Qsa[(s, a)] = v
            self.Nsa[(s, a)] = 1

        self.Ns[s] += 1
        return v
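The search method only updates the statistics; turning them into a move distribution happens elsewhere in the project. A hedged sketch of the usual AlphaZero-style conversion from the visit counts Nsa to action probabilities (an assumption, not the original code):

import numpy as np

def action_probs(mcts, s, action_size, temp=1.0):
    # Visit counts gathered by repeated calls to mcts.search from the root state s.
    counts = np.array([mcts.Nsa.get((s, a), 0) for a in range(action_size)], dtype=np.float64)
    if temp == 0:
        probs = np.zeros_like(counts)
        probs[np.argmax(counts)] = 1.0
        return probs
    counts = counts ** (1.0 / temp)
    total = counts.sum()
    return counts / total if total > 0 else np.full(action_size, 1.0 / action_size)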
Code Example #12
import numpy as np
import tensorflow as tf

from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Action

NUM_GRID = (7, 11)
NUM_CHANNEL = 8
NUM_ACT = 4
NUM_GEESE = 4

GAME_PER_GEN = 400
NUM_REPLAY_BUF = 3

NUM_LAMBDA = 0.95
NUM_RAND = 10

STOCK_X = tf.convert_to_tensor(np.zeros((*NUM_GRID, NUM_CHANNEL)),
                               dtype='int8')
STOCK_ACT = [Action(i + 1) for i in range(NUM_ACT)]


class Block(tf.keras.layers.Layer):
    def __init__(self, flt, **kwargs):
        super(Block, self).__init__(**kwargs)

        self.conv_0 = tf.keras.layers.Conv2D(flt, 3, use_bias=False)
        self.conv_1 = tf.keras.layers.Conv2D(flt, 3, use_bias=False)
        self.conv_2 = tf.keras.layers.Conv2D(flt, 1)

        self.bn_0 = tf.keras.layers.BatchNormalization()
        self.bn_1 = tf.keras.layers.BatchNormalization()

    def call(self, inp, training=False):
        x = inp
Code Example #13
from kaggle_environments.envs.hungry_geese.hungry_geese import Action, row_col


def translate(position: int, direction: Action, columns: int, rows: int) -> int:
    # Move one step in `direction`, wrapping around the toroidal board.
    row, column = row_col(position, columns)
    row_offset, column_offset = direction.to_row_col()
    row = (row + row_offset) % rows
    column = (column + column_offset) % columns
    return row * columns + column
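A short usage sketch showing the toroidal wrap-around: moving NORTH from the top row of the 7x11 Hungry Geese board lands on the bottom row (using the Action and row_col imports above).

columns, rows = 11, 7
pos = 5                                        # row 0, column 5
new_pos = translate(pos, Action.NORTH, columns, rows)
print(row_col(new_pos, columns))               # (6, 5): wrapped to the last row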