def agent(obs, config, prev_actions=None):
    if prev_actions is None:
        try:
            prev_actions = agent.prev_actions
        except AttributeError:
            agent.prev_actions = [None] * args.numAgents
            prev_actions = agent.prev_actions
    player = obs.index
    board = get_board(obs, prev_actions, args)
    board = get_player_board(board, player, args.numAgents)
    pi, _ = agent.net.predict(board)
    # remove invalid action
    prev_action = prev_actions[player]
    if prev_action is not None:
        prev_action = str_to_action(prev_action)
        oppo_action = Action.opposite(prev_action).value
        pi[oppo_action - 1] = 0
        pi /= pi.sum()
    action = np.argmax(pi) + 1
    action = Action(action).name
    prev_actions[player] = action
    return action
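# A hedged sketch of exercising the agent above locally with kaggle_environments.
# It assumes the supporting globals it relies on (args, agent.net, get_board,
# get_player_board, str_to_action) are already defined; "greedy" is one of the
# built-in baseline agents shipped with the hungry_geese environment.
from kaggle_environments import make

env = make("hungry_geese", debug=True)
env.run([agent, "greedy", "greedy", "greedy"])
print(env.render(mode="ansi"))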
def agent_strategy(self, observation, configuration):
    state = self.state_from_world()
    # Process reward for growing
    reward = len(self.my_body) + 2 * self.step  # Geese really like pizza!!! 8-)
    self.previous_length = len(self.my_body)
    self.process_reward(reward)
    # Choose action
    action = self.epsilon_greedy_choose_action(state)
    # Apply some common sense like colliding is bad... ;-)
    cs_reward = self.common_sense_after_move_choosen(action)
    if cs_reward < 0:
        # update q-table
        self.process_reward(reward, previous_state=state, last_action=action,
                            last_action_index=self.actions.index(action))
        # choose a new greedy, risk-averse valid action
        random_action = self.strategy_greedy_avoid_risk(observation, configuration)
        # update internal action attributes
        aux = [(action, index) for index, action in enumerate(Action)
               if action.name == random_action][0]
        self.last_action = aux[0]
        self.last_action_index = aux[1]
        action = self.last_action
    print(f'q-agent q_table{self.q_table}', flush=True)
    self.previous_action = action
    return Action(action).name
def select_action(pi, prev_action):
    '''
    Args:
        pi: np.array(4), policy over the four actions
        prev_action: action name str (e.g. 'NORTH') taken on the previous turn, or None
    Return:
        action: str
    '''
    if prev_action is not None:
        prev_action = str_to_action(prev_action)
        invalid_action = Action.opposite(prev_action)
        pi[invalid_action.value - 1] = 0
        pi /= np.sum(pi)
    action_num = np.random.choice(len(pi), p=pi) + 1
    action = Action(action_num).name
    return action
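# Hedged usage sketch for select_action above. It assumes str_to_action (defined
# elsewhere in this code) maps an action name such as 'NORTH' to the matching
# Action member; the policy vector here is made up.
import numpy as np

pi = np.array([0.4, 0.3, 0.2, 0.1])       # policy over NORTH, EAST, SOUTH, WEST
print(select_action(pi.copy(), None))     # first turn: any move can be sampled
print(select_action(pi.copy(), 'NORTH'))  # later turns: SOUTH (the reverse) is zeroed out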
def _translate(self, position: int, direction: Action) -> int:
    rows, columns = self._rows, self._columns
    row, column = row_col(position, columns)
    row_offset, column_offset = direction.to_row_col()
    row = (row + row_offset) % rows
    column = (column + column_offset) % columns
    return row * columns + column
def step(self, action):
    action += self.action_offset
    self.act_prev[0] = action
    obs, reward, done, info = self.trainer.step(Action(action).name)
    self.obs_prev = self.obs_backup
    self.obs_backup = obs
    reward = self.process_reward(obs, done)
    obs = self.process_obs(obs)
    return obs, reward, done, info
def get_valid_moves(state, player, action_size):
    moves = np.ones(action_size)
    if state[0].observation.step == 0:
        return moves
    prev_action = str_to_action(state[player]['action'])
    invalid_action = Action.opposite(prev_action)
    moves[invalid_action.value - 1] = 0
    return moves
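# A minimal check of the opposite-move masking used in get_valid_moves above,
# assuming the standard hungry_geese Action values (NORTH=1, EAST=2, SOUTH=3, WEST=4).
from kaggle_environments.envs.hungry_geese.hungry_geese import Action

moves = [1] * len(Action)
moves[Action.opposite(Action.NORTH).value - 1] = 0
print(moves)  # [1, 1, 0, 1] -> SOUTH, the reverse of NORTH, is masked out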
def opponent(self, obs, conf):
    obs_index = obs.index
    obs = self.process_obs(obs)
    action, _ = self.past_models[obs_index - 1].predict(obs)
    action += self.action_offset
    act_oppo = (self.act_prev[obs_index] + 1) % 4 + 1 \
        if self.act_prev[obs_index] is not None else 0
    if action == act_oppo:
        actions = [a for a in range(1, 5) if a != act_oppo]
        action = actions[random.randrange(len(actions))]
    self.act_prev[obs_index] = action
    return Action(action).name
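# Sanity check for the (value + 1) % 4 + 1 trick used in opponent above: with the
# standard Action ordering it maps each action's value to its opposite's value
# (NORTH <-> SOUTH, EAST <-> WEST), which is how the reverse move is detected.
from kaggle_environments.envs.hungry_geese.hungry_geese import Action

for a in Action:
    assert (a.value + 1) % 4 + 1 == Action.opposite(a).value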
def get_board(obs, prev_actions, args):
    '''
    Channels of state:
        geese curr_heads   0,  1,  2,  3
        geese tips         4,  5,  6,  7
        geese bodies       8,  9, 10, 11
        geese prev_heads  12, 13, 14, 15
        food              16
    '''
    board = np.zeros(
        (args.numAgents * 4 + 1, args.boardSize[0] * args.boardSize[1]),
        np.uint8)
    for i, goose in enumerate(obs.geese):
        # head position
        for head_pos in goose[:1]:
            board[0 + (i - obs.index) % args.numAgents, head_pos] = 1
        # tip (tail end) position
        for tip_pos in goose[-1:]:
            board[args.numAgents + (i - obs.index) % args.numAgents, tip_pos] = 1
        # body positions (every segment after the head)
        for body_pos in goose[1:]:
            board[args.numAgents * 2 + (i - obs.index) % args.numAgents, body_pos] = 1
        # previous head position
        for head_pos in goose[:1]:
            if prev_actions[i] is not None:
                prev_action = str_to_action(prev_actions[i])
                opposite_action = Action.opposite(prev_action)
                prev_head_pos = adjacent_positions(
                    head_pos, args.boardSize[1],
                    args.boardSize[0])[opposite_action.value - 1]
                board[args.numAgents * 3 + (i - obs.index) % args.numAgents,
                      prev_head_pos] = 1
    for food_pos in obs.food:
        board[-1, food_pos] = 1
    return board.reshape(-1, args.boardSize[0], args.boardSize[1])
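# Hedged sketch of building a board from a fresh environment state with the
# function above. The args object only needs numAgents and boardSize here, so a
# SimpleNamespace stands in for the real argument parser.
from types import SimpleNamespace
from kaggle_environments import make

env = make("hungry_geese")
env.reset(num_agents=4)
obs = env.state[0].observation
args = SimpleNamespace(numAgents=4, boardSize=(7, 11))
board = get_board(obs, [None] * 4, args)
print(board.shape)  # (17, 7, 11): 4 heads + 4 tips + 4 bodies + 4 prev heads + food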
def get_board(self, obs, player):
    '''
    Channels of state:
        geese curr_heads   0,  1,  2,  3
        geese tips         4,  5,  6,  7
        geese bodies       8,  9, 10, 11
        geese prev_heads  12, 13, 14, 15
        food              16
    '''
    board = np.zeros((self.num_agents * 4 + 1, self.config.rows * self.config.columns),
                     np.uint8)
    for i, goose in enumerate(obs.geese):
        # head position
        for head_pos in goose[:1]:
            board[0 + (i - player) % self.num_agents, head_pos] = 1
        # tip (tail end) position
        for tip_pos in goose[-1:]:
            board[4 + (i - player) % self.num_agents, tip_pos] = 1
        # body positions (every segment after the head)
        for body_pos in goose[1:]:
            board[8 + (i - player) % self.num_agents, body_pos] = 1
        # previous head position
        for head_pos in goose[:1]:
            if obs.step > 0:
                opposite_action = Action.opposite(self.str_to_action[self.prev_actions[i]])
                prev_head_pos = adjacent_positions(
                    head_pos, self.config.columns,
                    self.config.rows)[opposite_action.value - 1]
                board[12 + (i - player) % self.num_agents, prev_head_pos] = 1
    for food_pos in obs.food:
        board[-1, food_pos] = 1
    return board.reshape(-1, 7, 11)
def getValidMoves(self, player):
    prev_action = self.str_to_action[self.prev_actions[player]]
    oppo_action = Action.opposite(prev_action).value
    valids = [1] * len(Action)
    valids[oppo_action - 1] = 0
    return np.array(valids)
def search(self, env, prev_actions, remaining, rank):
    """
    This function performs one iteration of MCTS. It is recursively called
    till a leaf node is found. The action chosen at each node is one that
    has the maximum upper confidence bound as in the paper.

    Once a leaf node is found, the neural network is called to return an
    initial policy P and a value v for the state. This value is propagated
    up the search path. In case the leaf node is a terminal state, the
    outcome is propagated up the search path. The values of Ns, Nsa, Qsa
    are updated.

    NOTE: unlike the two-player template, this multi-agent variant does not
    negate the value between plies; v is always the value of the state for
    player 0.

    Returns:
        v: the value of the current state for player 0
    """
    state = env.state
    obs = state[0].observation
    board = get_board(obs, prev_actions, self.args)
    s = bytes(obs.step) + board.tostring()

    if s not in self.Es:
        self.Es[s] = get_reward(obs, 0, self.args.numAgents)
    if self.Es[s] is not None:
        # terminal node
        return self.Es[s]

    # predict every player's action
    actions = []
    pis, vs = self.nnet.predicts(board, rank % self.args.n_gpus)
    for i, pi in enumerate(pis):
        action = select_action(pi, prev_actions[i])
        actions.append(action)
    v = vs[0]

    if s not in self.Ps:
        # leaf node
        self.Ps[s], v = pis[0], v
        valids = get_valid_moves(state, 0, self.args.actionSize)
        self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
        sum_Ps_s = np.sum(self.Ps[s])
        if sum_Ps_s > 0:
            self.Ps[s] /= sum_Ps_s  # renormalize
        else:
            # If all valid moves were masked, make all valid moves equally
            # probable. This can happen if the NNet architecture is
            # insufficient, the model is overfitting, or something else has
            # gone wrong; if you see many of these messages, inspect the
            # NNet and/or the training process.
            log.error("All valid moves were masked, doing a workaround.")
            self.Ps[s] = self.Ps[s] + valids
            self.Ps[s] /= np.sum(self.Ps[s])

        self.Vs[s] = valids
        self.Ns[s] = 0
        # return v

    if remaining == 0:
        return v

    valids = self.Vs[s]
    cur_best = -float('inf')
    best_act = -1

    # pick the action with the highest upper confidence bound
    for a in range(self.args.actionSize):
        if valids[a]:
            if (s, a) in self.Qsa:
                u = self.Qsa[(s, a)] + self.args.cpuct * self.Ps[s][a] * \
                    math.sqrt(self.Ns[s]) / (1 + self.Nsa[(s, a)])
            else:
                u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
                    self.Ns[s] + EPS)  # Q = 0 ?

            if u > cur_best:
                cur_best = u
                best_act = a

    a = best_act
    actions[0] = Action(a + 1).name
    env.step(actions)

    v = self.search(env, actions, remaining - 1, rank)

    if (s, a) in self.Qsa:
        self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[(s, a)] + v) / (self.Nsa[(s, a)] + 1)
        self.Nsa[(s, a)] += 1
    else:
        self.Qsa[(s, a)] = v
        self.Nsa[(s, a)] = 1

    self.Ns[s] += 1
    return v
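# Toy, self-contained illustration of the PUCT upper-confidence rule applied in
# the action-selection loop above. All numbers (cpuct, priors, Q-values, visit
# counts) are made up; unvisited actions simply get Q = 0 here, mirroring the
# "Q = 0 ?" branch.
import math
import numpy as np

cpuct = 1.0
P = np.array([0.5, 0.3, 0.15, 0.05])   # prior policy P(s, a) from the network
Q = np.array([0.1, 0.4, 0.0, 0.0])     # running mean value Q(s, a)
N_sa = np.array([10, 3, 0, 0])         # per-action visit counts N(s, a)
N_s = N_sa.sum()                       # total visits of the state

u = Q + cpuct * P * math.sqrt(N_s) / (1 + N_sa)
print(u, np.argmax(u))  # action 1 wins: good value estimate and still few visits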
import numpy as np
import tensorflow as tf

from kaggle_environments import make
from kaggle_environments.envs.hungry_geese.hungry_geese import Action

NUM_GRID = (7, 11)
NUM_CHANNEL = 8
NUM_ACT = 4
NUM_GEESE = 4
GAME_PER_GEN = 400
NUM_REPLAY_BUF = 3
NUM_LAMBDA = 0.95
NUM_RAND = 10

STOCK_X = tf.convert_to_tensor(np.zeros((*NUM_GRID, NUM_CHANNEL)), dtype='int8')
STOCK_ACT = [Action(i + 1) for i in range(NUM_ACT)]


class Block(tf.keras.layers.Layer):
    def __init__(self, flt, **kwargs):
        super(Block, self).__init__(**kwargs)
        self.conv_0 = tf.keras.layers.Conv2D(flt, 3, use_bias=False)
        self.conv_1 = tf.keras.layers.Conv2D(flt, 3, use_bias=False)
        self.conv_2 = tf.keras.layers.Conv2D(flt, 1)
        self.bn_0 = tf.keras.layers.BatchNormalization()
        self.bn_1 = tf.keras.layers.BatchNormalization()

    def call(self, inp, training=False):
        x = inp
def translate(position: int, direction: Action, columns: int, rows: int):
    row, column = row_col(position, columns)
    row_offset, column_offset = direction.to_row_col()
    row = (row + row_offset) % rows
    column = (column + column_offset) % columns
    return row * columns + column
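# Quick check of the toroidal wrap-around behaviour of translate above on the
# standard 7x11 Hungry Geese board (Action and row_col come from the
# kaggle_environments hungry_geese helpers).
from kaggle_environments.envs.hungry_geese.hungry_geese import Action

columns, rows = 11, 7
print(translate(0, Action.NORTH, columns, rows))  # 66: row 0 wraps around to row 6
print(translate(0, Action.WEST, columns, rows))   # 10: column 0 wraps to column 10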