def test_init_game(self):
    game = Game()
    state, current_player = game.init_game()
    self.assertEqual(current_player, 0)
    for player in game.players:
        self.assertEqual(len(player.hand), utils.INITIAL_NUMBER_OF_CARDS)
    self.assertEqual(len(game.round.discard_pile), 1)
    self.assertEqual(len(game.round.discard_pile[0]), 1)
def test_end_after_n_steps(self):
    game = Game()
    game.init_game()
    game._early_end_reward = -1
    game._end_after_n_steps = 100
    while not game.is_over():
        legal_actions = game.get_legal_actions()
        if utils.DRAW_CARD_ACTION in legal_actions:
            action = utils.DRAW_CARD_ACTION
        else:
            action = np.random.choice(
                [a for a in legal_actions if a != utils.YANIV_ACTION])
        _, _ = game.step(action)

    # the game should be cut off after exactly 100 actions
    self.assertLessEqual(len(game.actions), 100)
    self.assertEqual(len(game.actions), 100)
    for p in game.get_payoffs():
        self.assertEqual(p, -1)
def test_end_after_n_deck_replacements(self):
    game = Game()
    game.init_game()
    game._early_end_reward = -1
    game._end_after_n_deck_replacements = 1
    while not game.is_over():
        legal_actions = game.get_legal_actions()
        if utils.DRAW_CARD_ACTION in legal_actions:
            action = utils.DRAW_CARD_ACTION
        else:
            action = np.random.choice(
                [a for a in legal_actions if a != utils.YANIV_ACTION])
        _, _ = game.step(action)

    # drawing through the deck once should take exactly 84 actions
    self.assertLessEqual(len(game.actions), 84)
    self.assertEqual(game.round.deck_replacements, 1)
    self.assertEqual(len(game.actions), 84)
    self.assertEqual(game.actions[-1], "draw_card")
    for p in game.get_payoffs():
        self.assertEqual(p, -1)
def test_get_player_num(self):
    game = Game()
    player_num = game.get_player_num()
    self.assertEqual(player_num, 2)

    game = Game(3)
    player_num = game.get_player_num()
    self.assertEqual(player_num, 3)
def test_pickup_discard(self):
    game = Game()
    _, current_player = game.init_game()

    # first player discards a known card, then draws from the deck
    card = game.players[current_player].hand[0]
    action = str(card)
    _, current_player = game.step(action)
    _, next_player = game.step(utils.DRAW_CARD_ACTION)

    # second player discards, then picks up the top available discard
    _, next_player = game.step(
        np.random.choice(
            [a for a in game.get_legal_actions() if a != "yaniv"]))
    _, current_player = game.step(utils.PICKUP_TOP_DISCARD_ACTION)

    # the first player's discarded card should now be in the second
    # player's hand
    self.assertIn(card, game.players[next_player].hand)
def test_proceed_game(self):
    game = Game()
    game.init_game()
    while not game.is_over():
        legal_actions = game.get_legal_actions()
        action = np.random.choice(legal_actions)
        self.assertIn(action, utils.ACTION_LIST)
        _, _ = game.step(action)

    if game.round.winner != -1:
        self.assertEqual(game.actions[-1], "yaniv")
    else:
        self.assertEqual(game.actions[-1], "draw_card")
def test_step(self):
    game = Game()
    _, current_player = game.init_game()

    # discard first
    action = np.random.choice(
        [a for a in game.get_legal_actions() if a != "yaniv"])
    self.assertNotIn(action, utils.pickup_actions)
    self.assertIn(action, utils.ACTION_LIST)
    _, next_player = game.step(action)
    self.assertEqual(next_player, current_player)

    # then pickup
    action = np.random.choice(game.get_legal_actions())
    self.assertIn(action, utils.pickup_actions)
    _, next_player = game.step(action)
    self.assertEqual(next_player,
                     (current_player + 1) % game.get_player_num())
def test_get_payoffs(self):
    game = Game()
    game.init_game()
    while not game.is_over():
        actions = game.get_legal_actions()
        action = np.random.choice(actions)
        state, _ = game.step(action)

    payoffs = game.get_payoffs()
    for player in game.players:
        player_id = player.get_player_id()
        payoff = payoffs[player_id]
        if game.round.winner == -1:
            self.assertEqual(payoff, -1)
        elif game.round.winner == player_id:
            self.assertEqual(payoff, 1)
        else:
            self.assertEqual(payoff, -(game.round.scores[player_id] / 50))
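# The payoff scheme the test above asserts, written out as a standalone
# helper for reference. sketch_payoffs is a hypothetical illustration
# inferred from the assertions, not the game's actual implementation: the
# winner gets 1, everyone else loses their round score scaled by 50, and an
# early-ended game (winner == -1) pays every player the early-end reward.
def sketch_payoffs(winner, scores, early_end_reward=-1.0):
    payoffs = []
    for player_id, score in enumerate(scores):
        if winner == -1:
            payoffs.append(early_end_reward)
        elif winner == player_id:
            payoffs.append(1.0)
        else:
            payoffs.append(-(score / 50))
    return payoffs

# e.g. player 0 calls yaniv and wins while player 1 holds 20 points:
# sketch_payoffs(winner=0, scores=[0, 20]) == [1.0, -0.4]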
class YanivEnv(MultiAgentEnv):
    def __init__(self, config=None):
        super().__init__()
        conf = DEFAULT_GAME_CONFIG.copy()
        conf.update(config or {})
        self.config = conf

        self.player_step_fn = conf.pop("player_step_fn", {})
        self.single_step = self.config.get("single_step", True)
        self.obs_scheme = self.config.get("observation_scheme", 0)
        self.num_players = self.config.get("n_players")
        self.state_n_players = self.config.get("state_n_players")
        self.step_reward = self.config.get("step_reward", 0)

        self.game = Game(single_step_actions=self.single_step,
                         num_players=self.num_players)
        self.game.configure(self.config)

        self.action_space = Discrete(self.game.get_action_num())
        self.observation_space = Dict({
            "action_mask": Box(0, 1, shape=(self.action_space.n, )),
            "state": Box(shape=(self._get_state_shape(), ),
                         low=0, high=1, dtype=int),
        })
        self.reward_range = (-1.0, 1.0)
        self.timestep = 0

    @property
    def current_player(self):
        return self.game.round.current_player

    @property
    def current_player_string(self):
        return self._get_player_string(self.current_player)

    def reset(self):
        self.game.init_game()
        self.timestep = 0
        self.step_player_fn()
        return {
            self.current_player_string:
            self._get_players_observation(self.current_player)
        }

    def step_player_fn(self):
        # let scripted players (e.g. rule-based bots) act until a learning
        # agent is up or the game ends
        while (self.current_player_string in self.player_step_fn
               and not self.game.is_over()):
            self.player_step_fn[self.current_player_string](self)

    def step(self, action_dict, raw_action=False):
        action = action_dict[self.current_player_string]
        if not raw_action:
            action = self._decode_action(action)

        self.game.step(action)
        self.step_player_fn()

        done = self.game.is_over()
        dones = {p: done for p in self._get_players()}
        dones["__all__"] = done

        if done:
            payoffs = self.game.get_payoffs()
            rewards = {
                self._get_player_string(i): payoffs[i]
                for i in range(self.num_players)
            }
            observations = {
                p: {
                    "state":
                    np.zeros(self.observation_space.spaces["state"].shape),
                    "action_mask": np.zeros(self.action_space.n),
                }
                for p in self._get_players()
            }
        else:
            rewards = {self.current_player_string: self.step_reward}
            observations = {
                self.current_player_string:
                self._get_players_observation(self.current_player)
            }

        infos = {p: {} for p in self._get_players()}
        self.timestep += 1

        return observations, rewards, dones, infos

    def _decode_action(self, action_id):
        if self.single_step:
            return utils.JOINED_ACTION_LIST[action_id]
        else:
            return utils.ACTION_LIST[action_id]

    def _get_observations(self):
        observations = {}
        for i in range(self.num_players):
            # RLlib only expects an observation for the agent acting next,
            # so skip everyone else
            if i != self.current_player:
                continue
            obs = self._get_players_observation(i)
            observations[self._get_player_string(i)] = obs
        return observations

    def _get_players_observation(self, id):
        if self.obs_scheme == 0:
            state = self._extract_state_0(id)
        elif self.obs_scheme == 1:
            state = self._extract_state_1(id)
        else:
            raise ValueError(
                "unknown observation scheme: {}".format(self.obs_scheme))

        return {
            "state": state,
            "action_mask": self._get_action_mask(id),
        }

    def _get_action_mask(self, player_id):
        if player_id != self.current_player:
            return np.zeros(self.action_space.n)

        legal_actions = self.game.get_legal_actions()
        if self.game._single_step_actions:
            legal_ids = [
                utils.JOINED_ACTION_SPACE[action] for action in legal_actions
            ]
        else:
            legal_ids = [
                utils.ACTION_SPACE[action] for action in legal_actions
            ]

        action_mask = np.zeros(self.action_space.n)
        np.put(action_mask, ind=legal_ids, v=1)
        return action_mask

    def _extract_state_0(self, player_id):
        if self.game.is_over():
            return np.zeros(self._get_state_shape())

        discard_pile = self.game.round.discard_pile
        if self.game.round.discarding:
            last_discard = discard_pile[-1]
        else:
            last_discard = discard_pile[-2]

        # only the top and bottom card of the last discard can be picked up
        available_discard = set([last_discard[0], last_discard[-1]])
        deadcards = [
            c for d in discard_pile for c in d if c not in available_discard
        ]

        current_player = self.game.players[player_id]

        known_cards = []
        hand_sizes = []
        for i in range(self.state_n_players - 1):
            next_id = self.game.round._get_next_player(player_id + i)
            next_player = self.game.players[next_id]
            known_cards.append(self.game.round.known_cards[next_id])

            opponent_hand_size = np.zeros(6)
            opponent_hand_size[len(next_player.hand)] = 1
            hand_sizes.append(opponent_hand_size)

        card_obs = [
            current_player.hand,
            available_discard,
            deadcards,
            *known_cards,
        ]

        if self.config["use_unkown_cards_in_state"]:
            # cards that could be anywhere: the deck plus whatever the last
            # opponent holds that we have not seen
            all_known = [c for cards in known_cards for c in cards]
            unknown_cards = self.game.round.dealer.deck + [
                c for c in next_player.hand if c not in all_known
            ]
            card_obs.append(unknown_cards)

        card_obs = np.ravel(list(map(utils.encode_cards, card_obs)))
        hand_sizes = np.ravel(hand_sizes)

        # note: only the hand size of the last opponent visited above is
        # appended, which is what _get_state_shape accounts for
        obs = np.concatenate((card_obs, opponent_hand_size))
        return obs

    def _extract_state_1(self, player_id):
        if self.game.is_over():
            return np.zeros(self._get_state_shape())

        discard_pile = self.game.round.discard_pile
        if self.game.round.discarding:
            last_discard = discard_pile[-1]
        else:
            last_discard = discard_pile[-2]

        top_card = last_discard[0]
        bottom_card = last_discard[-1]
        deadcards = [
            c for d in discard_pile for c in d
            if c not in (top_card, bottom_card)
        ]

        current_player = self.game.players[player_id]
        next_id = self.game.round._get_next_player(player_id)
        next_player = self.game.players[next_id]
        known_cards = self.game.round.known_cards[next_id]

        hand_enc = np.zeros(85)
        known_enc = np.zeros(85)
        if len(current_player.hand) > 0:
            hand_one_hot = utils.one_hot_encode_cards(current_player.hand)
            hand_enc[:hand_one_hot.shape[0]] = hand_one_hot
        if len(known_cards) > 0:
            known_one_hot = utils.one_hot_encode_cards(known_cards)
            known_enc[:known_one_hot.shape[0]] = known_one_hot

        opponent_hand_size = np.zeros(6)
        opponent_hand_size[len(next_player.hand)] = 1

        obs = [
            hand_enc,
            known_enc,
            opponent_hand_size,
            utils.one_hot_encode_card(top_card),
            utils.one_hot_encode_card(bottom_card),
            utils.encode_cards(deadcards)
            if self.config["use_dead_cards_in_state"] else [],
        ]
        return np.concatenate(obs)

    def _get_player_string(self, id):
        return "player_{}".format(id)

    def _get_players(self):
        return [self._get_player_string(i) for i in range(self.num_players)]

    def _get_state_shape(self):
        if self.obs_scheme == 0:
            shape = 162
            if self.config["use_unkown_cards_in_state"]:
                shape += 52
            if self.state_n_players > 1:
                shape += 52 * (self.state_n_players - 1)
            return shape
        elif self.obs_scheme == 1:
            shape = 210
            if self.config["use_dead_cards_in_state"]:
                shape += 52
            return shape
        else:
            raise ValueError(
                "unknown observation scheme: {}".format(self.obs_scheme))
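# A minimal self-play rollout sketch for the RLlib env above, assuming
# DEFAULT_GAME_CONFIG provides defaults for "n_players" and the other keys
# read in __init__. random_rollout is a hypothetical driver, not part of the
# codebase; it samples uniformly from the action mask, which is how the mask
# is intended to be consumed.
def random_rollout():
    env = YanivEnv()
    obs = env.reset()
    dones = {"__all__": False}
    while not dones["__all__"]:
        agent = env.current_player_string
        # legal actions are exactly the set bits of the mask
        legal = np.flatnonzero(obs[agent]["action_mask"])
        action = np.random.choice(legal)
        obs, rewards, dones, infos = env.step({agent: action})
    return rewards  # final payoffs keyed by player string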
class YanivEnv(Env):
    def __init__(self, config=None):
        config = config or {}
        self.name = "yaniv"
        self.single_step = config.get("single_step_actions", False)
        self.game = Game(single_step_actions=self.single_step)
        self.default_game_config = DEFAULT_GAME_CONFIG
        self.reward_func = calculate_reward

        # configure game
        super().__init__(config)
        self.state_shape = [266]

        _game_config = self.default_game_config.copy()
        for key in config:
            if key in _game_config:
                _game_config[key] = config[key]
        self.game.configure(_game_config)

    def _extract_state(self, state):
        if self.game.is_over():
            return {
                "obs": np.zeros(self.state_shape),
                "legal_actions": self._get_legal_actions(),
            }

        discard_pile = self.game.round.discard_pile
        if self.game.round.discarding:
            last_discard = discard_pile[-1]
        else:
            last_discard = discard_pile[-2]

        # only the top and bottom card of the last discard can be picked up
        available_discard = set([last_discard[0], last_discard[-1]])
        deadcards = [
            c for d in discard_pile for c in d if c not in available_discard
        ]

        current_player = self.game.players[self.game.round.current_player]
        next_player = self.game.players[self.game.round.get_next_player()]
        known_cards = self.game.round.known_cards[0]
        unknown_cards = self.game.round.dealer.deck + [
            c for c in next_player.hand if c not in known_cards
        ]

        card_obs = [
            current_player.hand,
            available_discard,
            deadcards,
            known_cards,
            unknown_cards,
        ]
        card_obs = np.ravel(list(map(utils.encode_cards, card_obs)))

        opponent_hand_size = np.zeros(6)
        opponent_hand_size[len(next_player.hand)] = 1

        obs = np.concatenate((card_obs, opponent_hand_size))

        extracted_state = {
            "obs": obs,
            "legal_actions": self._get_legal_actions(),
        }

        if self.allow_raw_data:
            extracted_state["raw_obs"] = state
            extracted_state["raw_legal_actions"] = [
                a for a in state["legal_actions"]
            ]
        if self.record_action:
            extracted_state["action_record"] = self.action_recorder

        return extracted_state

    def get_payoffs(self):
        return np.array(self.game.get_payoffs())

    def _decode_action(self, action_id):
        if self.single_step:
            return utils.JOINED_ACTION_LIST[action_id]
        else:
            return utils.ACTION_LIST[action_id]

    def _get_legal_actions(self):
        legal_actions = self.game.get_legal_actions()
        if self.game._single_step_actions:
            legal_ids = [
                utils.JOINED_ACTION_SPACE[action] for action in legal_actions
            ]
        else:
            legal_ids = [
                utils.ACTION_SPACE[action] for action in legal_actions
            ]
        return legal_ids

    def _load_model(self):
        """Load pretrained/rule model

        Returns:
            model (Model): A Model object
        """
        raise NotImplementedError

    def step(self, action, raw_action=False):
        if not raw_action:
            action = self._decode_action(action)

        if self.single_agent_mode:
            return self._single_agent_step(action)

        self.timestep += 1
        # Record the action for the human interface
        if self.record_action:
            self.action_recorder.append([self.get_player_id(), action])
        next_state, player_id = self.game.step(action)

        return self._extract_state(next_state), player_id

    def run(self, is_training=False):
        """Run a complete game, either for evaluation or for training an RL agent.

        Args:
            is_training (boolean): True if for training purposes.

        Returns:
            (tuple) Tuple containing:

                (list): A list of trajectories generated from the environment.
                (list): A list of payoffs. Each entry corresponds to one player.

        Note: The trajectories are a 3-dimensional list. The first dimension
            is for different players. The second dimension is for different
            transitions. The third dimension is for the contents of each
            transition.
        """
        if self.single_agent_mode:
            raise ValueError("Run in single agent not allowed.")

        trajectories = [[] for _ in range(self.player_num)]
        state, player_id = self.reset()

        # Loop to play the game
        trajectories[player_id].append(state)
        while not self.is_over():
            # Agent plays
            if is_training:
                action = self.agents[player_id].step(state)
            else:
                action, _ = self.agents[player_id].eval_step(state)

            # Environment steps
            next_state, next_player_id = self.step(
                action, self.agents[player_id].use_raw)
            # Save action
            trajectories[player_id].append(action)

            # Save the shaped per-step reward
            decoded_action = action
            if not self.agents[player_id].use_raw:
                decoded_action = self._decode_action(action)
            trajectories[player_id].append(
                self.reward_func(state, next_state, decoded_action))

            # Set the state and player
            state = next_state
            player_id = next_player_id

            # Save state
            if not self.game.is_over():
                trajectories[player_id].append(state)

        # Add a final state for all the players
        for player_id in range(self.player_num):
            state = self.get_state(player_id)
            trajectories[player_id].append(state)

        # Payoffs
        payoffs = self.get_payoffs()

        # Reorganize the trajectories
        trajectories = reorganize(trajectories, payoffs)

        return trajectories, payoffs
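# A throwaway driver for the rlcard-style env above. SketchRandomAgent is a
# hypothetical stand-in exposing the three members run() relies on (step,
# eval_step, use_raw); whether the base Env offers a set_agents helper
# depends on the rlcard version, so agents are assigned directly here.
class SketchRandomAgent:
    use_raw = False

    def step(self, state):
        # pick any legal action id
        return np.random.choice(state["legal_actions"])

    def eval_step(self, state):
        return self.step(state), None


env = YanivEnv({"single_step_actions": True})
env.agents = [SketchRandomAgent() for _ in range(env.player_num)]
trajectories, payoffs = env.run(is_training=False)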
class YanivEnv(gym.Env):
    metadata = {"render.modes": ["human"], "name": "Yaniv-v0"}

    def __init__(self, single_step=True, config=None):
        super().__init__()
        self.single_step = single_step
        self.num_players = 2

        self.game = Game(single_step_actions=single_step,
                         num_players=self.num_players)
        conf = DEFAULT_GAME_CONFIG.copy()
        conf.update(config or {})
        self.game.configure(conf)

        single_aspace = spaces.Discrete(self.game.get_action_num())
        self.action_space = spaces.Tuple(
            [single_aspace for _ in range(self.num_players)])

        single_obs = spaces.Box(shape=(266, ), low=0, high=1, dtype=int)
        self.observation_space = spaces.Tuple(
            [single_obs for _ in range(self.num_players)])

        self.timestep = 0
        self.current_player = None

    def reset(self):
        state, player_id = self.game.init_game()
        self.current_player = player_id
        self.timestep = 0
        return self._get_observations()

    def step(self, action, raw_action=False):
        if not raw_action:
            action = self._decode_action(action)

        self.timestep += 1
        _, player_id = self.game.step(action)

        done = self.game.is_over()
        if done:
            rewards = self.game.get_payoffs()
        else:
            # small step penalty to encourage finishing the game
            rewards = [-0.1 for _ in range(self.num_players)]

        self.current_player = player_id

        return (
            self._get_observations(),
            rewards,
            done,
            {
                "current_player": player_id,
                "legal_actions": self._get_legal_actions(),
            },
        )

    def _get_observations(self):
        observations = []
        for i in range(self.num_players):
            obs = self._extract_state(i)
            observations.append(obs)
        return observations

    def _decode_action(self, action_id):
        if self.single_step:
            return utils.JOINED_ACTION_LIST[action_id]
        else:
            return utils.ACTION_LIST[action_id]

    def _get_legal_actions(self):
        legal_actions = self.game.get_legal_actions()
        if self.game._single_step_actions:
            legal_ids = [
                utils.JOINED_ACTION_SPACE[action] for action in legal_actions
            ]
        else:
            legal_ids = [
                utils.ACTION_SPACE[action] for action in legal_actions
            ]
        return legal_ids

    def get_moves(self):
        return self._get_legal_actions()

    def _extract_state(self, player_id):
        if self.game.is_over():
            return np.zeros(self.observation_space.spaces[0].shape)

        discard_pile = self.game.round.discard_pile
        if self.game.round.discarding:
            last_discard = discard_pile[-1]
        else:
            last_discard = discard_pile[-2]

        # only the top and bottom card of the last discard can be picked up
        available_discard = set([last_discard[0], last_discard[-1]])
        deadcards = [
            c for d in discard_pile for c in d if c not in available_discard
        ]

        current_player = self.game.players[player_id]
        next_player = self.game.players[self.game.round._get_next_player(
            player_id)]
        known_cards = self.game.round.known_cards[player_id]
        unknown_cards = self.game.round.dealer.deck + [
            c for c in next_player.hand if c not in known_cards
        ]

        card_obs = [
            current_player.hand,
            available_discard,
            deadcards,
            known_cards,
            unknown_cards,
        ]
        card_obs = np.ravel(list(map(utils.encode_cards, card_obs)))

        opponent_hand_size = np.zeros(6)
        opponent_hand_size[len(next_player.hand)] = 1

        return np.concatenate((card_obs, opponent_hand_size))

    def render(self, mode="human"):
        state = self.game.get_state(self.current_player)
        _print_state(state, [])
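# A quick self-play loop for the gym-style env above: step() takes a single
# action for whoever is to move and reports the legal action ids for the
# next player in the info dict, so the first set of legal moves comes from
# get_moves(). A sketch, not part of the codebase.
env = YanivEnv(single_step=True)
obs = env.reset()
legal_actions = env.get_moves()
done = False
while not done:
    action = np.random.choice(legal_actions)
    obs, rewards, done, info = env.step(action)
    legal_actions = info["legal_actions"]
print(rewards)  # per-player payoffs once the game is over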
def test_get_action_num(self):
    game = Game()
    action_num = game.get_action_num()
    self.assertEqual(action_num, 488)