def train_mahjong():
    # Make environments; allow_step_back is needed so the CFR-style agent
    # can traverse the game tree (the original comment said "human mode")
    env = rlcard.make('mahjong', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the number of iterations and how frequently we evaluate the
    # performance and save the model
    evaluate_every = 100
    evaluate_num = 10000
    episode_num = 10000

    # The path for saving the logs and learning curves
    log_dir = './experiments/mahjong_emccfr_result/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the CFR agent
    model_path = 'models/mahjong_oscfr'
    agent = OutcomeSampling_CFR(env, model_path=model_path)
    agent.load()  # If we have a saved model, load it first

    # Evaluate CFR against pre-trained NFSP agents (mahjong has four seats,
    # so fill the remaining three with NFSP agents)
    nfsp_agents = models.load('mahjong-nfsp').agents
    eval_env.set_agents([agent] + nfsp_agents[1:])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')

        # Evaluate the performance. Play against the NFSP agents.
        if episode % evaluate_every == 0:
            agent.save()  # Save model
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('EMCCFR')
def test_config_chips(self):
    env = rlcard.make(
        'no-limit-holdem',
        config={
            'game_player_num': 5,
            'chips_for_each': [100, 200, 400, 600, 900]
        })
    env.game.init_game()
    players = env.game.players
    chips = []
    for i in range(5):
        chips.append(players[i].remained_chips + players[i].in_chips)
    self.assertEqual(chips, [100, 200, 400, 600, 900])
def __init__(self, seed=None):
    super().__init__()
    self.env = rlcard.make('mahjong', config={"seed": seed})
    self.agents = ['player_0', 'player_1', 'player_2', 'player_3']
    self.num_agents = len(self.agents)
    self.has_reset = False
    self.observation_spaces = self._convert_to_dict(
        [spaces.Box(low=0.0, high=1.0, shape=(6, 34, 4),
                    dtype=np.bool_)  # np.bool was removed in NumPy 1.24
         for _ in range(self.num_agents)])
    self.action_spaces = self._convert_to_dict(
        [spaces.Discrete(self.env.game.get_action_num())
         for _ in range(self.num_agents)])
    self.agent_order = list(self.agents)
    self._agent_selector = agent_selector(self.agent_order)
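# Several of the wrappers in this collection call a _convert_to_dict helper
# inherited from their base class. A minimal sketch of what it plausibly
# does, judging only from the call sites (an assumption, not the verified
# PettingZoo source):
def _convert_to_dict(self, list_of_list):
    # Pair each agent name with its per-agent entry, yielding e.g.
    # {'player_0': space_0, 'player_1': space_1, ...}
    return dict(zip(self.agents, list_of_list))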
def is_deterministic(env_name):
    env = rlcard.make(env_name)

    NUM_STEPS = 25

    actions = [
        random.randrange(env.game.get_num_actions()) for _ in range(NUM_STEPS)
    ]
    base_seed = 12941
    hashes = []
    for rand_iters in range(2):
        env = rlcard.make(env_name, config={'seed': base_seed})
        hashes.append(
            hash(
                tuple([
                    hash_obsevation(obs['obs'])
                    for obs in gather_observations(env, actions, rand_iters)
                ])))

    return hashes[0] == hashes[1]
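# The determinism check above leans on two helpers defined elsewhere. The
# sketches below are assumptions inferred from the call sites, not the
# canonical implementations; the helper keeps the test's original spelling
# of hash_obsevation.
import random

def hash_obsevation(obs):
    # Hash a NumPy observation by its raw bytes; fall back to the builtin
    # hash for plain Python values.
    try:
        return hash(obs.tobytes())
    except AttributeError:
        return hash(obs)

def gather_observations(env, actions, num_rand_steps):
    # Draw from the global RNG first, so two runs with different
    # num_rand_steps also verify the env does not depend on outside
    # randomness.
    for _ in range(num_rand_steps):
        random.random()
    state, _ = env.reset()
    observations = []
    for action in actions:
        if env.is_over():
            break
        # Map the fixed action index onto the currently legal actions
        legal_actions = list(state['legal_actions'])
        state, _ = env.step(legal_actions[action % len(legal_actions)])
        observations.append(state)
    return observations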
def __init__(self, seed=None):
    super().__init__()
    self.env = rlcard.make('doudizhu', config={"seed": seed})
    self.agents = ['landlord_0', 'peasant_0', 'peasant_1']
    self.num_agents = len(self.agents)
    self.has_reset = False
    self.observation_spaces = self._convert_to_dict(
        [spaces.Box(low=0.0, high=1.0, shape=(6, 5, 15),
                    dtype=np.bool_)  # np.bool was removed in NumPy 1.24
         for _ in range(self.num_agents)])
    self.action_spaces = self._convert_to_dict(
        [spaces.Discrete(self.env.game.get_action_num())
         for _ in range(self.num_agents)])
    self.agent_order = self.agents
    self._agent_selector = agent_selector(self.agent_order)
def test_train(self):
    env = rlcard.make('leduc-holdem', allow_step_back=True)
    agent = CFRAgent(env)

    for _ in range(100):
        agent.train()

    state = {'obs': np.array([1., 1., 0., 0., 0., 0.]),
             'legal_actions': [0, 2]}
    action = agent.eval_step(state)

    self.assertIn(action, [0, 2])
def test_single_agent_mode(self):
    env = rlcard.make('leduc-holdem', config={'single_agent_mode': True})
    with self.assertRaises(ValueError):
        env.set_agents([])
    with self.assertRaises(ValueError):
        env.run()

    state = env.reset()
    self.assertIsInstance(state, dict)
    for _ in range(100):
        state, _, _ = env.step(np.random.choice(state['legal_actions']))
def test_decode_action(self):
    env = rlcard.make('doudizhu')
    env.reset()
    env.game.state['actions'] = ['33366', '33355']
    env.game.judger.playable_cards[0] = [
        '5', '6', '55', '555', '33366', '33355'
    ]
    decoded = env._decode_action(3)
    self.assertEqual(decoded, '6')
    env.game.state['actions'] = ['444', '44466', '44455']
    decoded = env._decode_action(29)
    self.assertEqual(decoded, '444')
def test_decode_action(self):
    env = rlcard.make('simple-doudizhu')
    env.reset()
    env.game.state['actions'] = ['888TT', '88899']
    env.game.judger.playable_cards[0] = [
        '9', 'T', '99', '999', '888TT', '88899'
    ]
    decoded = env._decode_action(28)
    self.assertEqual(decoded, '888TT')
    env.game.state['actions'] = ['888', '88899', '888TT']
    decoded = env._decode_action(14)
    self.assertEqual(decoded, '888')
def test_decode_action(self):
    env = rlcard.make('no-limit-holdem')
    state, _ = env.init_game()
    for action in state['legal_actions']:
        decoded = env._decode_action(action)
        self.assertIn(decoded, env.actions)
    decoded = env._decode_action(3)
    self.assertEqual(decoded, 'fold')
    env.step(0)
    decoded = env._decode_action(0)
    self.assertEqual(decoded, 'check')
def test_decode_action(self):
    env = rlcard.make('no-limit-holdem')
    state, _ = env.reset()
    for action in state['legal_actions']:
        decoded = env._decode_action(action)
        self.assertIn(decoded, env.actions)
    decoded = env._decode_action(Action.FOLD.value)
    self.assertEqual(decoded, Action.FOLD)
    env.step(0)
    decoded = env._decode_action(1)
    self.assertEqual(decoded, Action.CHECK)
def test_run(self):
    env = rlcard.make('gin-rummy')
    env.set_agents(
        [RandomAgent(env.num_actions) for _ in range(env.num_players)])
    trajectories, payoffs = env.run(is_training=False)
    self.assertEqual(len(trajectories), 2)
    for payoff in payoffs:
        self.assertLessEqual(-1, payoff)
        self.assertLessEqual(payoff, 1)
    trajectories, payoffs = env.run(is_training=True)
    for payoff in payoffs:
        self.assertLessEqual(-1, payoff)
        self.assertLessEqual(payoff, 1)
def __init__(self, seed=None):
    super().__init__()
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
    self.env = rlcard.make('leduc-holdem', config={"seed": seed})
    self.agents = ['player_0', 'player_1']
    self.num_agents = len(self.agents)
    self.has_reset = False
    self.observation_spaces = self._convert_to_dict(
        [spaces.Box(low=0.0, high=1.0, shape=(36,),
                    dtype=np.bool_)  # np.bool was removed in NumPy 1.24
         for _ in range(self.num_agents)])
    self.action_spaces = self._convert_to_dict(
        [spaces.Discrete(self.env.game.get_action_num())
         for _ in range(self.num_agents)])
    self.agent_order = self.agents
    self._agent_selector = agent_selector(self.agent_order)
def test_run(self):
    env = rlcard.make('uno')
    env.set_agents(
        [RandomAgent(env.action_num) for _ in range(env.player_num)])
    trajectories, payoffs = env.run(is_training=False)
    self.assertEqual(len(trajectories), 2)
    total = 0
    for payoff in payoffs:
        total += payoff
    self.assertEqual(total, 0)

    trajectories, payoffs = env.run(is_training=True)
    total = 0
    for payoff in payoffs:
        total += payoff
    self.assertEqual(total, 0)
def train():
    env = rlcard.make('mahjong', {'allow_step_back': True})
    # env = rlcard.make('mahjong')

    # Set the iteration numbers and how frequently we evaluate/save the plot
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 100000

    # The paths for saving the logs and learning curves
    root_path = './experiments/mahjong_cfr_result/'
    log_path = root_path + 'log.txt'
    csv_path = root_path + 'performance.csv'
    figure_path = root_path + 'figures/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the CFR agent
    agent = MCCFRAgent(env)

    # Init a Logger to plot the learning curve
    logger = Logger(root_path)

    for episode in range(episode_num + 1):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        if episode % 5000 == 0:
            agent.save(episode)

        # # Evaluate the performance. Play with NFSP agents.
        # if episode % evaluate_every == 0:
        #     reward = 0
        #     for eval_episode in range(evaluate_num):
        #         _, payoffs = eval_env.run(is_training=False)
        #         reward += payoffs[0]
        #
        #     logger.log('\n########## Evaluation ##########')
        #     logger.log('Iteration: {} Average reward is {}'.format(
        #         episode, float(reward) / evaluate_num))
        #
        #     # Add point to logger
        #     logger.add_point(x=env.timestep, y=float(reward) / evaluate_num)
        #
        # # Make plot
        # if episode % save_plot_every == 0 and episode > 0:
        #     logger.make_plot(save_path=figure_path + str(episode) + '.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')
def __init__(self):
    env = rlcard.make('uno')
    self.sess1 = tf.compat.v1.Session()
    global_step = tf.Variable(0, name='global_step', trainable=False)
    self.agent = DQNAgent(self.sess1,
                          scope='dqn',
                          action_num=env.action_num,
                          replay_memory_init_size=memory_init_size,  # module-level constant, defined elsewhere
                          norm_step=norm_step,                       # module-level constant, defined elsewhere
                          state_shape=env.state_shape,
                          mlp_layers=[100, 100])
    # Use the compat.v1 API consistently; the original mixed it with the
    # removed TF1 top-level names
    self.sess1.run(tf.compat.v1.global_variables_initializer())
    self.saver = tf.compat.v1.train.Saver()
    self.saver.restore(self.sess1,
                       './experiments/uno_dqn_result/models/model1.ckpt')
def __init__(self, seed=None):
    super().__init__()
    if seed is not None:
        np.random.seed(seed)
        random.seed(seed)
    self.env = rlcard.make('no-limit-holdem', config={"seed": seed})
    self.agents = ['player_0', 'player_1']
    self.num_agents = len(self.agents)
    self.has_reset = False
    self.observation_spaces = self._convert_to_dict(
        [spaces.Box(low=np.zeros(54,),
                    high=np.append(np.ones(52,), [100, 100]),
                    dtype=np.float64)
         for _ in range(self.num_agents)])
    self.action_spaces = self._convert_to_dict(
        [spaces.Discrete(self.env.game.get_action_num())
         for _ in range(self.num_agents)])
    self.agent_order = self.agents
    self._agent_selector = agent_selector(self.agent_order)
def test_run(self):
    env = rlcard.make('doudizhu')
    env.set_agents(
        [RandomAgent(env.num_actions) for _ in range(env.num_players)])
    trajectories, payoffs = env.run(is_training=False)
    self.assertEqual(len(trajectories), 3)
    win = []
    for player_id, payoff in enumerate(payoffs):
        if payoff == 1:
            win.append(player_id)
    if len(win) == 1:
        self.assertEqual(env.game.players[win[0]].role, 'landlord')
    if len(win) == 2:
        self.assertEqual(env.game.players[win[0]].role, 'peasant')
        self.assertEqual(env.game.players[win[1]].role, 'peasant')
def run(self):
    #import tensorflow as tf
    self.env = rlcard.make('blackjack')
    self.sess = tf.Session()
    agent = DQNAgent(self.sess,
                     scope='sub-dqn' + str(self.index),
                     action_num=self.env.action_num,
                     replay_memory_init_size=memory_init_size,
                     norm_step=norm_step,
                     state_shape=self.env.state_shape,
                     mlp_layers=[10, 10])
    self.env.set_agents([agent])
    self.sess.run(tf.global_variables_initializer())

    # Normalize the state inputs by feeding warm-up transitions
    for _ in range(norm_step):
        trajectories, _ = self.env.run()
        for ts in trajectories[0]:
            agent.feed(ts)

    # Receive instructions to run games and generate trajectories
    while True:
        instruction = self.input_queue.get()
        if instruction is not None:
            tasks, train_flag, variables, total_t = instruction

            # For evaluation
            if not train_flag:
                agent.total_t = total_t
                global_vars = [
                    tf.convert_to_tensor(var) for var in variables
                ]
                agent.copy_params_op(global_vars)
                for _ in range(tasks):
                    _, payoffs = self.env.run(is_training=train_flag)
                    self.output_queue.put(payoffs)

            # For training
            else:
                for _ in range(tasks):
                    trajectories, _ = self.env.run(is_training=train_flag)
                    self.output_queue.put(trajectories)
            self.input_queue.task_done()
        else:
            self.input_queue.task_done()
            break
    self.sess.close()
    return
def test_save_and_load(self):
    env = rlcard.make('leduc-holdem', config={'allow_step_back': True})
    agent = CFRAgent(env)
    for _ in range(100):
        agent.train()
    agent.save()

    new_agent = CFRAgent(env)
    new_agent.load()
    self.assertEqual(len(agent.policy), len(new_agent.policy))
    self.assertEqual(len(agent.average_policy), len(new_agent.average_policy))
    self.assertEqual(len(agent.regrets), len(new_agent.regrets))
    self.assertEqual(agent.iteration, new_agent.iteration)
def train(args):
    # Make the environment
    env = rlcard.make(args.env)

    # Initialize the DMC trainer
    trainer = DMCTrainer(env,
                         load_model=args.load_model,
                         xpid=args.xpid,
                         savedir=args.savedir,
                         save_interval=args.save_interval,
                         num_actor_devices=args.num_actor_devices,
                         num_actors=args.num_actors,
                         training_device=args.training_device)

    # Train DMC Agents
    trainer.start()
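# train() above expects an argparse-style namespace. A minimal, hypothetical
# driver whose flag names mirror exactly the attributes train() reads; the
# defaults are illustrative assumptions, not the canonical RLCard CLI.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser('DMC training example')
    parser.add_argument('--env', type=str, default='doudizhu')
    parser.add_argument('--load_model', action='store_true')
    parser.add_argument('--xpid', type=str, default='doudizhu')
    parser.add_argument('--savedir', type=str, default='experiments/dmc_result')
    parser.add_argument('--save_interval', type=int, default=30)
    parser.add_argument('--num_actor_devices', type=int, default=1)
    parser.add_argument('--num_actors', type=int, default=5)
    parser.add_argument('--training_device', type=str, default='0')
    train(parser.parse_args())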
def main():
    warnings.simplefilter(action='ignore', category=FutureWarning)
    set_global_seed(0)
    env = rlcard.make('limit-holdem', config={'record_action': True})
    human_agent = HumanAgent(env.action_num)
    dqn_agent = DQNAgent(env.action_num,
                         env.state_shape[0],
                         hidden_neurons=[1024, 512, 1024, 512])
    dqn_agent.load(sys.argv[1])
    env.set_agents([human_agent, dqn_agent])
    play(env)
def __init__(self, name, num_players, obs_shape):
    super().__init__()
    self.name = name
    self.env = rlcard.make(name)
    if not hasattr(self, "agents"):
        self.agents = [f'player_{i}' for i in range(num_players)]
    self.possible_agents = self.agents[:]

    dtype = self.env.reset()[0]['obs'].dtype
    if dtype == np.dtype(np.int64):
        self._dtype = np.dtype(np.int8)
    elif dtype == np.dtype(np.float64):
        self._dtype = np.dtype(np.float32)
    else:
        self._dtype = dtype

    self.observation_spaces = self._convert_to_dict(
        [spaces.Box(low=0.0, high=1.0, shape=obs_shape, dtype=self._dtype)
         for _ in range(self.num_agents)])
    self.action_spaces = self._convert_to_dict(
        [spaces.Discrete(self.env.game.get_action_num())
         for _ in range(self.num_agents)])
def __init__(self):
    ''' Load pretrained model
    '''
    import tensorflow as tf
    from rlcard.agents import NFSPAgent, RandomAgent

    self.graph = tf.Graph()

    # Mitigation for gpu memory issue
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    self.sess = tf.Session(graph=self.graph, config=config)

    env = rlcard.make('tractor')
    with self.graph.as_default():
        self.nfsp_agents = []
        # for i in range(env.player_num):
        #     agent = NFSPAgent(self.sess,
        #                       scope='nfsp' + str(i),
        #                       action_num=env.action_num,
        #                       state_shape=env.state_shape,
        #                       hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
        #                       q_mlp_layers=[512, 1024, 2048, 1024, 512])
        #     self.nfsp_agents.append(agent)
        for i in range(1):
            agent = NFSPAgent(self.sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[2048, 2048],
                              q_mlp_layers=[2048, 2048],
                              # evaluate_with='average_policy')
                              evaluate_with='best_response')
            self.nfsp_agents.append(agent)

    check_point_path = os.path.join(TRACTOR_PATH, 'nfsp_continue_350k_0.99')
    with self.sess.as_default():
        with self.graph.as_default():
            saver = tf.train.Saver()
            saver.restore(self.sess,
                          tf.train.latest_checkpoint(check_point_path))
def __init__(self):
    ''' Load pretrained model
    '''
    env = rlcard.make('leduc-holdem')
    self.nfsp_agents = []
    for i in range(env.player_num):
        agent = NFSPAgentPytorch(scope='nfsp' + str(i),
                                 action_num=env.action_num,
                                 state_shape=env.state_shape,
                                 hidden_layers_sizes=[128, 128],
                                 q_mlp_layers=[128, 128],
                                 device=torch.device('cpu'))
        self.nfsp_agents.append(agent)

    check_point_path = os.path.join(ROOT_PATH,
                                    'leduc_holdem_nfsp_pytorch/model.pth')
    checkpoint = torch.load(check_point_path)
    for agent in self.nfsp_agents:
        agent.load(checkpoint)
def __init__(self, name, num_players, obs_shape):
    super().__init__()
    self.name = name
    self.num_players = num_players
    config = {
        'allow_step_back': False,
        'seed': None,
        'game_num_players': num_players
    }
    self.env = rlcard.make(name, config)
    self.screen = None
    if not hasattr(self, "agents"):
        self.agents = [f'player_{i}' for i in range(num_players)]
    self.possible_agents = self.agents[:]

    dtype = self.env.reset()[0]['obs'].dtype
    if dtype == np.dtype(np.int64):
        self._dtype = np.dtype(np.int8)
    elif dtype == np.dtype(np.float64):
        self._dtype = np.dtype(np.float32)
    else:
        self._dtype = dtype

    self.observation_spaces = self._convert_to_dict([
        spaces.Dict({
            'observation': spaces.Box(
                low=0.0, high=1.0, shape=obs_shape, dtype=self._dtype),
            'action_mask': spaces.Box(
                low=0, high=1, shape=(self.env.num_actions,), dtype=np.int8)
        })
        for _ in range(self.num_agents)
    ])
    self.action_spaces = self._convert_to_dict([
        spaces.Discrete(self.env.num_actions)
        for _ in range(self.num_agents)
    ])
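# The Dict observation space above pairs the board encoding with an action
# mask. A minimal sketch of how a consumer might use that mask to pick only
# legal actions; masked_random_action is illustrative and not part of the
# wrapper itself.
import numpy as np

def masked_random_action(obs, rng=np.random):
    legal = np.flatnonzero(obs['action_mask'])  # indices where the mask is 1
    return int(rng.choice(legal))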
def run(args):
    # Make environment
    env = rlcard.make(args.env, config={'seed': 42})

    # Seed numpy, torch, random
    set_seed(42)

    # Set agents
    agent = RandomAgent(num_actions=env.num_actions)
    env.set_agents([agent for _ in range(env.num_players)])

    # Generate data from the environment
    trajectories, player_wins = env.run(is_training=False)

    # Print out the trajectories
    print('\nTrajectories:')
    print(trajectories)
    print('\nSample raw observation:')
    pprint.pprint(trajectories[0][0]['raw_obs'])
    print('\nSample raw legal_actions:')
    pprint.pprint(trajectories[0][0]['raw_legal_actions'])
def evaluate(args):
    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Load models
    agents = []
    for position, model_path in enumerate(args.models):
        agents.append(load_model(model_path, env, position, device))
    env.set_agents(agents)

    # Evaluate
    rewards = tournament(env, args.num_games)
    for position, reward in enumerate(rewards):
        print(position, args.models[position], reward)
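# load_model is a helper defined elsewhere in this script. A plausible
# sketch in the spirit of RLCard's evaluation example, dispatching on the
# kind of path given; treat the exact branches as assumptions.
import os

def load_model(model_path, env=None, position=None, device=None):
    if os.path.isfile(model_path):
        # A single-file checkpoint: assume a pickled PyTorch agent
        import torch
        agent = torch.load(model_path, map_location=device)
        agent.set_device(device)
    elif os.path.isdir(model_path):
        # A directory: assume a saved CFR policy
        from rlcard.agents import CFRAgent
        agent = CFRAgent(env, model_path)
        agent.load()
    elif model_path == 'random':
        from rlcard.agents import RandomAgent
        agent = RandomAgent(num_actions=env.num_actions)
    else:
        # Otherwise assume a name from the RLCard model zoo
        from rlcard import models
        agent = models.load(model_path).agents[position]
    return agent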
def test_train(self):
    num_iterations = 10
    sess = tf.InteractiveSession()
    env = rlcard.make('leduc-holdem', {'allow_step_back': True})
    agent = DeepCFR(session=sess,
                    scope='deepcfr',
                    env=env,
                    policy_network_layers=(128, 128),
                    advantage_network_layers=(128, 128),
                    num_traversals=1,
                    num_step=1,
                    learning_rate=1e-4,
                    batch_size_advantage=64,
                    batch_size_strategy=64,
                    memory_capacity=int(1e5))

    # Test train
    for _ in range(num_iterations):
        agent.train()

    # Test eval_step
    state = {
        'obs': np.random.random_sample(env.state_shape),
        'legal_actions': [a for a in range(env.action_num)]
    }
    action, _ = agent.eval_step(state)
    self.assertIn(action, [a for a in range(env.action_num)])

    # Test simulate other
    action = agent.simulate_other(0, state)
    self.assertIn(action, [a for a in range(env.action_num)])

    # Test action advantage
    advantages = agent.action_advantage(state, 0)
    self.assertEqual(advantages.shape[0], env.action_num)

    sess.close()
    tf.reset_default_graph()
def test_init(self):
    sess = tf.InteractiveSession()
    env = rlcard.make('leduc-holdem', allow_step_back=True)
    agent = DeepCFR(session=sess,
                    env=env,
                    policy_network_layers=(4, 4),
                    advantage_network_layers=(4, 4),
                    num_traversals=1,
                    num_step=1,
                    learning_rate=1e-4,
                    batch_size_advantage=10,
                    batch_size_strategy=10,
                    memory_capacity=int(1e7))

    self.assertEqual(agent._num_traversals, 1)
    self.assertEqual(agent._num_step, 1)
    self.assertEqual(agent._batch_size_advantage, 10)
    self.assertEqual(agent._batch_size_strategy, 10)

    sess.close()
    tf.reset_default_graph()