Example #1
def train_mahjong():
    # Make training and evaluation environments; step_back is enabled for CFR-style traversal
    env = rlcard.make('mahjong', config={'seed': 0, 'allow_step_back': True})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the number of iterations and how frequently we evaluate performance and save the model
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 10000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_emccfr_result/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the outcome-sampling CFR agent
    model_path = 'models/mahjong_oscfr'
    agent = OutcomeSampling_CFR(env, model_path=model_path)
    agent.load()  # Load the model if one was saved previously

    # Evaluate CFR against pre-trained NFSP
    eval_env.set_agents([agent, models.load('mahjong-nfsp').agents[0]])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        # Evaluate the performance. Play with NFSP agents.
        if episode % evaluate_every == 0:
            agent.save()  # Save model
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('EMCCFR')
Example #2
 def test_config_chips(self):
     env = rlcard.make('no-limit-holdem',
                       config={
                           'game_player_num': 5,
                           'chips_for_each': [100, 200, 400, 600, 900]
                       })
     env.game.init_game()
     players = env.game.players
     chips = []
     for i in range(5):
         chips.append(players[i].remained_chips + players[i].in_chips)
     self.assertEqual(chips, [100, 200, 400, 600, 900])
Example #3
    def __init__(self, seed=None):
        super().__init__()
        self.env = rlcard.make('mahjong', config={"seed": seed})
        self.agents = ['player_0', 'player_1', 'player_2', 'player_3']
        self.num_agents = len(self.agents)
        self.has_reset = False

        self.observation_spaces = self._convert_to_dict([spaces.Box(low=0.0, high=1.0, shape=(6, 34, 4), dtype=np.bool_) for _ in range(self.num_agents)])
        self.action_spaces = self._convert_to_dict([spaces.Discrete(self.env.game.get_action_num()) for _ in range(self.num_agents)])

        self.agent_order = list(self.agents)
        self._agent_selector = agent_selector(self.agent_order)
Example #4
def is_deterministic(env_name):
    env = rlcard.make(env_name)

    NUM_STEPS = 25

    actions = [
        random.randrange(env.game.get_num_actions()) for _ in range(NUM_STEPS)
    ]
    base_seed = 12941
    hashes = []
    for rand_iters in range(2):
        env = rlcard.make(env_name, config={'seed': base_seed})

        hashes.append(
            hash(
                tuple([
                    hash_obsevation(obs['obs'])
                    for obs in gather_observations(env, actions, rand_iters)
                ])))

    return hashes[0] == hashes[1]
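The helpers hash_obsevation and gather_observations are not shown above. A minimal sketch of what they could look like, assuming rlcard's env.reset()/env.step() interface and numpy observations (an illustration, not rlcard's actual test code):

import random

def hash_obsevation(obs):
    # Hash a numpy observation through its raw bytes; fall back to hash() otherwise.
    try:
        return hash(obs.tobytes())
    except AttributeError:
        return hash(obs)

def gather_observations(env, actions, num_rand_steps):
    # Perturb the global RNG so determinism cannot come from identical random state.
    for _ in range(num_rand_steps):
        random.random()
    state, _ = env.reset()
    observations = [state]
    for action in actions:
        if env.is_over():
            break
        legal = list(state['legal_actions'])
        # Map the pre-sampled action index onto the currently legal actions.
        state, _ = env.step(legal[action % len(legal)])
        observations.append(state)
    return observations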
Example #5
    def __init__(self, seed=None):
        super().__init__()
        self.env = rlcard.make('doudizhu', config={"seed": seed})
        self.agents = ['landlord_0', 'peasant_0', 'peasant_1']
        self.num_agents = len(self.agents)
        self.has_reset = False

        self.observation_spaces = self._convert_to_dict([spaces.Box(low=0.0, high=1.0, shape=(6, 5, 15), dtype=np.bool_) for _ in range(self.num_agents)])
        self.action_spaces = self._convert_to_dict([spaces.Discrete(self.env.game.get_action_num()) for _ in range(self.num_agents)])

        self.agent_order = self.agents
        self._agent_selector = agent_selector(self.agent_order)
Example #6
    def test_train(self):

        env = rlcard.make('leduc-holdem', allow_step_back=True)
        agent = CFRAgent(env)

        for _ in range(100):
            agent.train()

        state = {'obs': np.array([1., 1., 0., 0., 0., 0.]), 'legal_actions': [0,2]}
        action = agent.eval_step(state)

        self.assertIn(action, [0, 2])
Example #7
    def test_single_agent_mode(self):
        env = rlcard.make('leduc-holdem', config={'single_agent_mode':True})
        with self.assertRaises(ValueError):
            env.set_agents([])

        with self.assertRaises(ValueError):
            env.run()

        state = env.reset()
        self.assertIsInstance(state, dict)
        for _ in range(100):
            state, _, _ = env.step(np.random.choice(state['legal_actions']))
Example #8
 def test_decode_action(self):
     env = rlcard.make('doudizhu')
     env.reset()
     env.game.state['actions'] = ['33366', '33355']
     env.game.judger.playable_cards[0] = [
         '5', '6', '55', '555', '33366', '33355'
     ]
     decoded = env._decode_action(3)
     self.assertEqual(decoded, '6')
     env.game.state['actions'] = ['444', '44466', '44455']
     decoded = env._decode_action(29)
     self.assertEqual(decoded, '444')
Example #9
 def test_decode_action(self):
     env = rlcard.make('simple-doudizhu')
     env.reset()
     env.game.state['actions'] = ['888TT', '88899']
     env.game.judger.playable_cards[0] = [
         '9', 'T', '99', '999', '888TT', '88899'
     ]
     decoded = env._decode_action(28)
     self.assertEqual(decoded, '888TT')
     env.game.state['actions'] = ['888', '88899', '888TT']
     decoded = env._decode_action(14)
     self.assertEqual(decoded, '888')
Example #10
    def test_decode_action(self):
        env = rlcard.make('no-limit-holdem')
        state, _ = env.init_game()
        for action in state['legal_actions']:
            decoded = env._decode_action(action)
            self.assertIn(decoded, env.actions)

        decoded = env._decode_action(3)
        self.assertEqual(decoded, 'fold')

        env.step(0)
        decoded = env._decode_action(0)
        self.assertEqual(decoded, 'check')
Example #11
    def test_decode_action(self):
        env = rlcard.make('no-limit-holdem')
        state, _ = env.reset()
        for action in state['legal_actions']:
            decoded = env._decode_action(action)
            self.assertIn(decoded, env.actions)

        decoded = env._decode_action(Action.FOLD.value)
        self.assertEqual(decoded, Action.FOLD)

        env.step(0)
        decoded = env._decode_action(1)
        self.assertEqual(decoded, Action.CHECK)
Example #12
 def test_run(self):
     env = rlcard.make('gin-rummy')
     env.set_agents(
         [RandomAgent(env.num_actions) for _ in range(env.num_players)])
     trajectories, payoffs = env.run(is_training=False)
     self.assertEqual(len(trajectories), 2)
     for payoff in payoffs:
         self.assertLessEqual(-1, payoff)
         self.assertLessEqual(payoff, 1)
     trajectories, payoffs = env.run(is_training=True)
     for payoff in payoffs:
         self.assertLessEqual(-1, payoff)
         self.assertLessEqual(payoff, 1)
Example #13
    def __init__(self, seed=None):
        super().__init__()
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
        self.env = rlcard.make('leduc-holdem', config={"seed": seed})
        self.agents = ['player_0', 'player_1']
        self.num_agents = len(self.agents)
        self.has_reset = False

        self.observation_spaces = self._convert_to_dict([spaces.Box(low=0.0, high=1.0, shape=(36,), dtype=np.bool_) for _ in range(self.num_agents)])
        self.action_spaces = self._convert_to_dict([spaces.Discrete(self.env.game.get_action_num()) for _ in range(self.num_agents)])
        self.agent_order = self.agents
        self._agent_selector = agent_selector(self.agent_order)
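The _convert_to_dict helper used by these wrappers is not shown; presumably it just pairs each per-agent entry with the matching agent name, along these lines (an assumption, not the wrapper's actual code):

    def _convert_to_dict(self, list_of_list):
        # Pair the per-agent entries with the agent names, in order.
        return dict(zip(self.agents, list_of_list))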
Example #14
 def test_run(self):
     env = rlcard.make('uno')
     env.set_agents([RandomAgent(env.action_num) for _ in range(env.player_num)])
     trajectories, payoffs = env.run(is_training=False)
     self.assertEqual(len(trajectories), 2)
     total = 0
     for payoff in payoffs:
         total += payoff
     self.assertEqual(total, 0)
     trajectories, payoffs = env.run(is_training=True)
     total = 0
     for payoff in payoffs:
         total += payoff
     self.assertEqual(total, 0)
Example #15
def train():
    env = rlcard.make('mahjong', {'allow_step_back': True})
    # env = rlcard.make('mahjong')

    # Set the number of iterations and how frequently we evaluate and save the plot
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 100000

    # The paths for saving the logs and learning curves
    root_path = './experiments/mahjong_cfr_result/'
    log_path = root_path + 'log.txt'
    csv_path = root_path + 'performance.csv'
    figure_path = root_path + 'figures/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the MCCFR agent
    agent = MCCFRAgent(env)
    # Init a Logger to plot the learning curve
    logger = Logger(root_path)

    for episode in range(episode_num + 1):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        if episode % 5000 == 0:
            agent.save(episode)
        # # Evaluate the performance. Play with NFSP agents.
        # if episode % evaluate_every == 0:
        #     reward = 0
        #     for eval_episode in range(evaluate_num):
        #         _, payoffs = eval_env.run(is_training=False)
        #
        #         reward += payoffs[0]
        #
        #     logger.log('\n########## Evaluation ##########')
        #     logger.log('Iteration: {} Average reward is {}'.format(episode, float(reward)/evaluate_num))
        #
        #     # Add point to logger
        #     logger.add_point(x=env.timestep, y=float(reward)/evaluate_num)
        #
        # # Make plot
        # if episode % save_plot_every == 0 and episode > 0:
        #     logger.make_plot(save_path=figure_path+str(episode)+'.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')
Example #16
 def __init__(self):
     env = rlcard.make('uno')
     self.sess1 = tf.compat.v1.Session()
     global_step = tf.Variable(0, name='global_step', trainable=False)
     self.agent = DQNAgent(self.sess1,
                           scope='dqn',
                           action_num=env.action_num,
                           replay_memory_init_size=memory_init_size,
                           norm_step=norm_step,
                           state_shape=env.state_shape,
                           mlp_layers=[100, 100])
     self.sess1.run(tf.compat.v1.global_variables_initializer())
     self.saver = tf.compat.v1.train.Saver()
     self.saver.restore(self.sess1,
                        './experiments/uno_dqn_result/models/model1.ckpt')
Example #17
    def __init__(self, seed=None):
        super().__init__()
        if seed is not None:
            np.random.seed(seed)
            random.seed(seed)
        self.env = rlcard.make('no-limit-holdem', config={"seed": seed})
        self.agents = ['player_0', 'player_1']
        self.num_agents = len(self.agents)
        self.has_reset = False

        self.observation_spaces = self._convert_to_dict([spaces.Box(low=np.zeros(54,), high=np.append(np.ones(52,), [100, 100]), dtype=np.float64) for _ in range(self.num_agents)])
        self.action_spaces = self._convert_to_dict([spaces.Discrete(self.env.game.get_action_num()) for _ in range(self.num_agents)])

        self.agent_order = self.agents
        self._agent_selector = agent_selector(self.agent_order)
Example #18
 def test_run(self):
     env = rlcard.make('doudizhu')
     env.set_agents(
         [RandomAgent(env.num_actions) for _ in range(env.num_players)])
     trajectories, payoffs = env.run(is_training=False)
     self.assertEqual(len(trajectories), 3)
     win = []
     for player_id, payoff in enumerate(payoffs):
         if payoff == 1:
             win.append(player_id)
     if len(win) == 1:
         self.assertEqual(env.game.players[win[0]].role, 'landlord')
     if len(win) == 2:
         self.assertEqual(env.game.players[win[0]].role, 'peasant')
         self.assertEqual(env.game.players[win[1]].role, 'peasant')
Example #19
    def run(self):
        #import tensorflow as tf
        self.env = rlcard.make('blackjack')
        self.sess = tf.Session()
        agent = DQNAgent(self.sess,
                         scope='sub-dqn' + str(self.index),
                         action_num=self.env.action_num,
                         replay_memory_init_size=memory_init_size,
                         norm_step=norm_step,
                         state_shape=self.env.state_shape,
                         mlp_layers=[10, 10])
        self.env.set_agents([agent])
        self.sess.run(tf.global_variables_initializer())

        # normalize
        for _ in range(norm_step):
            trajectories, _ = self.env.run()
            for ts in trajectories[0]:
                agent.feed(ts)

        # Receive instruction to run game and generate trajectories
        while True:
            instruction = self.input_queue.get()
            if instruction is not None:
                tasks, train_flag, variables, total_t = instruction

                # For evaluation
                if not train_flag:
                    agent.total_t = total_t
                    global_vars = [
                        tf.convert_to_tensor(var) for var in variables
                    ]
                    agent.copy_params_op(global_vars)
                    for _ in range(tasks):
                        _, payoffs = self.env.run(is_training=train_flag)
                        self.output_queue.put(payoffs)

                # For training
                else:
                    for _ in range(tasks):
                        trajectories, _ = self.env.run(is_training=train_flag)
                        self.output_queue.put(trajectories)
                self.input_queue.task_done()
            else:
                self.input_queue.task_done()
                break
        self.sess.close()
        return
Example #20
    def test_save_and_load(self):
        env = rlcard.make('leduc-holdem', config={'allow_step_back': True})
        agent = CFRAgent(env)

        for _ in range(100):
            agent.train()

        agent.save()

        new_agent = CFRAgent(env)
        new_agent.load()
        self.assertEqual(len(agent.policy), len(new_agent.policy))
        self.assertEqual(len(agent.average_policy),
                         len(new_agent.average_policy))
        self.assertEqual(len(agent.regrets), len(new_agent.regrets))
        self.assertEqual(agent.iteration, new_agent.iteration)
Example #21
def train(args):

    # Make the environment
    env = rlcard.make(args.env)

    # Initialize the DMC trainer
    trainer = DMCTrainer(env,
                         load_model=args.load_model,
                         xpid=args.xpid,
                         savedir=args.savedir,
                         save_interval=args.save_interval,
                         num_actor_devices=args.num_actor_devices,
                         num_actors=args.num_actors,
                         training_device=args.training_device)

    # Train DMC Agents
    trainer.start()
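train expects an argparse-style namespace; a minimal, hypothetical driver (the flag names below are assumptions that simply mirror the attributes used above):

if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='DMC training (illustrative flags)')
    parser.add_argument('--env', default='doudizhu')
    parser.add_argument('--load_model', action='store_true')
    parser.add_argument('--xpid', default='doudizhu')
    parser.add_argument('--savedir', default='experiments/dmc_result')
    parser.add_argument('--save_interval', type=int, default=30)
    parser.add_argument('--num_actor_devices', type=int, default=1)
    parser.add_argument('--num_actors', type=int, default=5)
    parser.add_argument('--training_device', default='0')

    train(parser.parse_args())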
Example #22
def main():
    warnings.simplefilter(action='ignore', category=FutureWarning)

    set_global_seed(0)

    env = rlcard.make('limit-holdem', config={'record_action': True})
    human_agent = HumanAgent(env.action_num)

    dqn_agent = DQNAgent(env.action_num,
                         env.state_shape[0],
                         hidden_neurons=[1024, 512, 1024, 512])

    dqn_agent.load(sys.argv[1])

    env.set_agents([human_agent, dqn_agent])

    play(env)
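The play helper is not defined in this snippet; an assumed sketch of the interactive loop it might run (illustrative only):

def play(env):
    # Assumed loop: keep dealing hands until the human declines to continue.
    while True:
        print('>> Start a new game')
        _, payoffs = env.run(is_training=False)
        print('>> Payoffs:', payoffs)
        if input('>> Play again? (y/n): ').strip().lower() != 'y':
            break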
Example #23
    def __init__(self, name, num_players, obs_shape):
        super().__init__()
        self.name = name
        self.env = rlcard.make(name)
        if not hasattr(self, "agents"):
            self.agents = [f'player_{i}' for i in range(num_players)]
        self.possible_agents = self.agents[:]

        dtype = self.env.reset()[0]['obs'].dtype
        if dtype == np.dtype(np.int64):
            self._dtype = np.dtype(np.int8)
        elif dtype == np.dtype(np.float64):
            self._dtype = np.dtype(np.float32)
        else:
            self._dtype = dtype

        self.observation_spaces = self._convert_to_dict([spaces.Box(low=0.0, high=1.0, shape=obs_shape, dtype=self._dtype) for _ in range(self.num_agents)])
        self.action_spaces = self._convert_to_dict([spaces.Discrete(self.env.game.get_action_num()) for _ in range(self.num_agents)])
Example #24
    def __init__(self):
        ''' Load pretrained model
        '''
        import tensorflow as tf
        from rlcard.agents import NFSPAgent, RandomAgent
        self.graph = tf.Graph()

        # Mitigation for gpu memory issue
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        self.sess = tf.Session(graph=self.graph, config=config)

        env = rlcard.make('tractor')
        with self.graph.as_default():
            self.nfsp_agents = []
            # for i in range(env.player_num):
            #     agent = NFSPAgent(self.sess,
            #                       scope='nfsp' + str(i),
            #                       action_num=env.action_num,
            #                       state_shape=env.state_shape,
            #                       hidden_layers_sizes=[512,1024,2048,1024,512],
            #                       q_mlp_layers=[512,1024,2048,1024,512])
            #     self.nfsp_agents.append(agent)

            for i in range(1):
                agent = NFSPAgent(self.sess,
                                scope='nfsp' + str(i),
                                action_num=env.action_num,
                                state_shape=env.state_shape,
                                hidden_layers_sizes=[2048,2048],
                                q_mlp_layers=[2048,2048],
                                # evaluate_with='average_policy')
                                evaluate_with='best_response')

                self.nfsp_agents.append(agent)

        check_point_path = os.path.join(TRACTOR_PATH, 'nfsp_continue_350k_0.99')

        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
Example #25
    def __init__(self):
        ''' Load pretrained model
        '''
        env = rlcard.make('leduc-holdem')
        self.nfsp_agents = []
        for i in range(env.player_num):
            agent = NFSPAgentPytorch(scope='nfsp' + str(i),
                                     action_num=env.action_num,
                                     state_shape=env.state_shape,
                                     hidden_layers_sizes=[128, 128],
                                     q_mlp_layers=[128, 128],
                                     device=torch.device('cpu'))
            self.nfsp_agents.append(agent)

        check_point_path = os.path.join(ROOT_PATH,
                                        'leduc_holdem_nfsp_pytorch/model.pth')
        checkpoint = torch.load(check_point_path)
        for agent in self.nfsp_agents:
            agent.load(checkpoint)
Example #26
    def __init__(self, name, num_players, obs_shape):
        super().__init__()
        self.name = name
        self.num_players = num_players
        config = {
            'allow_step_back': False,
            'seed': None,
            'game_num_players': num_players
        }

        self.env = rlcard.make(name, config)
        self.screen = None
        if not hasattr(self, "agents"):
            self.agents = [f'player_{i}' for i in range(num_players)]
        self.possible_agents = self.agents[:]

        dtype = self.env.reset()[0]['obs'].dtype
        if dtype == np.dtype(np.int64):
            self._dtype = np.dtype(np.int8)
        elif dtype == np.dtype(np.float64):
            self._dtype = np.dtype(np.float32)
        else:
            self._dtype = dtype

        self.observation_spaces = self._convert_to_dict([
            spaces.Dict({
                'observation':
                spaces.Box(low=0.0,
                           high=1.0,
                           shape=obs_shape,
                           dtype=self._dtype),
                'action_mask':
                spaces.Box(low=0,
                           high=1,
                           shape=(self.env.num_actions, ),
                           dtype=np.int8)
            }) for _ in range(self.num_agents)
        ])
        self.action_spaces = self._convert_to_dict([
            spaces.Discrete(self.env.num_actions)
            for _ in range(self.num_agents)
        ])
Example #27
def run(args):
    # Make environment
    env = rlcard.make(args.env, config={'seed': 42})

    # Seed numpy, torch, random
    set_seed(42)

    # Set agents
    agent = RandomAgent(num_actions=env.num_actions)
    env.set_agents([agent for _ in range(env.num_players)])

    # Generate data from the environment
    trajectories, player_wins = env.run(is_training=False)
    # Print out the trajectories
    print('\nTrajectories:')
    print(trajectories)
    print('\nSample raw observation:')
    pprint.pprint(trajectories[0][0]['raw_obs'])
    print('\nSample raw legal_actions:')
    pprint.pprint(trajectories[0][0]['raw_legal_actions'])
Example #28
def evaluate(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={'seed': args.seed})

    # Load models
    agents = []
    for position, model_path in enumerate(args.models):
        agents.append(load_model(model_path, env, position, device))
    env.set_agents(agents)

    # Evaluate
    rewards = tournament(env, args.num_games)
    for position, reward in enumerate(rewards):
        print(position, args.models[position], reward)
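load_model is defined elsewhere; a rough sketch of what it might do, assuming each entry in args.models is either the string 'random' or a path to a torch checkpoint containing a pickled agent (an assumption, not rlcard's actual helper):

import torch
from rlcard.agents import RandomAgent

def load_model(model_path, env, position, device):
    # 'random' is a shortcut for a uniformly random agent; anything else is
    # treated as a torch checkpoint produced by torch.save(agent, path).
    if model_path == 'random':
        return RandomAgent(num_actions=env.num_actions)
    return torch.load(model_path, map_location=device)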
Example #29
    def test_train(self):

        num_iterations = 10

        sess = tf.InteractiveSession()
        env = rlcard.make('leduc-holdem', {'allow_step_back': True})
        agent = DeepCFR(session=sess,
                        scope='deepcfr',
                        env=env,
                        policy_network_layers=(128, 128),
                        advantage_network_layers=(128, 128),
                        num_traversals=1,
                        num_step=1,
                        learning_rate=1e-4,
                        batch_size_advantage=64,
                        batch_size_strategy=64,
                        memory_capacity=int(1e5))

        # Test train
        for _ in range(num_iterations):
            agent.train()

        # Test eval_step
        state = {
            'obs': np.random.random_sample(env.state_shape),
            'legal_actions': [a for a in range(env.action_num)]
        }
        action, _ = agent.eval_step(state)
        self.assertIn(action, [a for a in range(env.action_num)])

        # Test simulate other
        action = agent.simulate_other(0, state)
        self.assertIn(action, [a for a in range(env.action_num)])

        # Test action advantage
        advantages = agent.action_advantage(state, 0)
        self.assertEqual(advantages.shape[0], env.action_num)

        sess.close()
        tf.reset_default_graph()
Example #30
    def test_init(self):

        sess = tf.InteractiveSession()
        env = rlcard.make('leduc-holdem', allow_step_back=True)
        agent = DeepCFR(session=sess,
                        env=env,
                        policy_network_layers=(4,4),
                        advantage_network_layers=(4,4),
                        num_traversals=1,
                        num_step=1,
                        learning_rate=1e-4,
                        batch_size_advantage=10,
                        batch_size_strategy=10,
                        memory_capacity=int(1e7))

        self.assertEqual(agent._num_traversals, 1)
        self.assertEqual(agent._num_step, 1)
        self.assertEqual(agent._batch_size_advantage, 10)
        self.assertEqual(agent._batch_size_strategy, 10)

        sess.close()
        tf.reset_default_graph()