Example #1
    def __init__(self):
        """ Load pretrained model
        """
        super().__init__()
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        env = rlcard.make('leduc-holdem')
        with self.graph.as_default():
            self.nfsp_agents = []
            for i in range(env.player_num):
                agent = NFSPAgent(self.sess,
                                  scope='nfsp' + str(i),
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  hidden_layers_sizes=[128, 128],
                                  q_norm_step=1000,
                                  q_mlp_layers=[128, 128])
                self.nfsp_agents.append(agent)
            normalize(env, self.nfsp_agents, 1000)
            self.sess.run(tf.global_variables_initializer())

        check_point_path = os.path.join(ROOT_PATH, 'leduc_holdem_nfsp')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver(tf.model_variables())
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))
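
    # Hedged usage note (not part of the original excerpt): this __init__ belongs
    # to a pretrained-model wrapper class; in older rlcard releases such a model
    # is typically obtained through the model registry rather than instantiated
    # directly, e.g. (the model id 'leduc-holdem-nfsp' is an assumption):
    #
    #     from rlcard import models
    #     nfsp_agents = models.load('leduc-holdem-nfsp').agents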
Example #2
def load_nfsp_leduc_agent(model_path):
    # Set a global seed
    set_global_seed(0)

    # Load pretrained model
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    # Make the environment so player_num, action_num and state_shape are available
    env = rlcard.make('leduc-holdem')

    with graph.as_default():
        nfsp_agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[128, 128],
                              q_mlp_layers=[128, 128])
            nfsp_agents.append(agent)

    # rlcard ships a pretrained Leduc Hold'em NFSP model (path below). Pass your
    # own checkpoint directory as model_path to load a different model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

    return nfsp_agents[0]
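
# Hedged usage sketch (not part of the original example): load the agent from a
# checkpoint directory and pit it against a random agent, mirroring the
# evaluation snippets further down. The checkpoint path, the import location of
# tournament, and the number of evaluation games are assumptions.
import rlcard
from rlcard.agents.random_agent import RandomAgent
from rlcard.utils import tournament

nfsp_agent = load_nfsp_leduc_agent('./models/leduc_holdem_nfsp')

eval_env = rlcard.make('leduc-holdem')
eval_env.set_agents([nfsp_agent, RandomAgent(action_num=eval_env.action_num)])
print('Average payoff vs. a random agent:', tournament(eval_env, 1000)[0])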
Example #3
    def __init__(self):
        ''' Load pretrained model
        '''
        import tensorflow as tf
        from rlcard.agents.nfsp_agent import NFSPAgent
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        env = rlcard.make('leduc-holdem')
        with self.graph.as_default():
            self.nfsp_agents = []
            for i in range(env.player_num):
                agent = NFSPAgent(self.sess,
                                  scope='nfsp' + str(i),
                                  action_num=env.action_num,
                                  state_shape=env.state_shape,
                                  hidden_layers_sizes=[128, 128],
                                  q_mlp_layers=[128, 128])
                self.nfsp_agents.append(agent)

        check_point_path = os.path.join(ROOT_PATH, 'leduc_holdem_nfsp')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
Example #4
    def test_train(self):

        memory_init_size = 20
        num_steps = 1000

        agent = NFSPAgent(num_actions=2,
                          state_shape=[2],
                          hidden_layers_sizes=[10, 10],
                          reservoir_buffer_capacity=50,
                          batch_size=4,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_size=50,
                          q_replay_memory_init_size=memory_init_size,
                          q_batch_size=4,
                          q_mlp_layers=[10, 10],
                          device=torch.device('cpu'))

        predicted_action, _ = agent.eval_step({
            'obs':
            np.random.random_sample((2, )),
            'legal_actions': {
                0: None,
                1: None
            },
            'raw_legal_actions': ['call', 'raise']
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        for _ in range(num_steps):
            agent.sample_episode_policy()
            predicted_action = agent.step({
                'obs': np.random.random_sample((2, )),
                'legal_actions': {
                    0: None,
                    1: None
                }
            })
            self.assertGreaterEqual(predicted_action, 0)
            self.assertLessEqual(predicted_action, 1)

            ts = [{
                'obs': np.random.random_sample((2, )),
                'legal_actions': {
                    0: None,
                    1: None
                }
            },
                  np.random.randint(2), 0, {
                      'obs': np.random.random_sample((2, )),
                      'legal_actions': {
                          0: None,
                          1: None
                      },
                      'raw_legal_actions': ['call', 'raise']
                  }, True]
            agent.feed(ts)
Example #5
    def test_init(self):

        agent = NFSPAgent(num_actions=10,
                          state_shape=[10],
                          hidden_layers_sizes=[10, 10],
                          q_mlp_layers=[10, 10],
                          device=torch.device('cpu'))

        self.assertEqual(agent._num_actions, 10)
Example #6
    def test_train(self):

        memory_init_size = 20
        step_num = 1000

        sess = tf.compat.v1.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)
        agent = NFSPAgent(sess=sess,
                          scope='nfsp',
                          action_num=2,
                          state_shape=[2],
                          hidden_layers_sizes=[10, 10],
                          reservoir_buffer_capacity=50,
                          batch_size=4,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_size=50,
                          q_replay_memory_init_size=memory_init_size,
                          q_batch_size=4,
                          q_mlp_layers=[10, 10])
        sess.run(tf.compat.v1.global_variables_initializer())

        predicted_action, _ = agent.eval_step({
            'obs':
            np.random.random_sample((2, )),
            'legal_actions': [0, 1]
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        for _ in range(step_num):
            agent.sample_episode_policy()
            predicted_action = agent.step({
                'obs': np.random.random_sample((2, )),
                'legal_actions': [0, 1]
            })
            self.assertGreaterEqual(predicted_action, 0)
            self.assertLessEqual(predicted_action, 1)

            ts = [{
                'obs': np.random.random_sample((2, )),
                'legal_actions': [0, 1]
            },
                  np.random.randint(2), 0, {
                      'obs': np.random.random_sample((2, )),
                      'legal_actions': [0, 1]
                  }, True]
            agent.feed(ts)

        sess.close()
        tf.compat.v1.reset_default_graph()
Example #7
    def test_init(self):

        sess = tf.compat.v1.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)

        agent = NFSPAgent(sess=sess,
                          scope='nfsp',
                          action_num=10,
                          state_shape=[10],
                          hidden_layers_sizes=[10, 10],
                          q_mlp_layers=[10, 10])

        self.assertEqual(agent._action_num, 10)

        sess.close()
        tf.compat.v1.reset_default_graph()
Example #8
    def test_evaluate_with(self):
        # Test average policy and value error here
        sess = tf.compat.v1.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)

        agent = NFSPAgent(sess=sess,
                          scope='nfsp',
                          action_num=2,
                          state_shape=[2],
                          hidden_layers_sizes=[10, 10],
                          q_mlp_layers=[10, 10],
                          evaluate_with='average_policy')
        sess.run(tf.compat.v1.global_variables_initializer())
        predicted_action, _ = agent.eval_step({
            'obs':
            np.random.random_sample((2, )),
            'legal_actions': [0, 1]
        })
        self.assertGreaterEqual(predicted_action, 0)
        self.assertLessEqual(predicted_action, 1)

        sess.close()
        tf.compat.v1.reset_default_graph()

        sess = tf.compat.v1.InteractiveSession()
        tf.Variable(0, name='global_step', trainable=False)

        agent = NFSPAgent(sess=sess,
                          scope='nfsp',
                          action_num=2,
                          state_shape=[2],
                          hidden_layers_sizes=[10, 10],
                          q_mlp_layers=[10, 10],
                          evaluate_with='random')
        sess.run(tf.compat.v1.global_variables_initializer())
        with self.assertRaises(ValueError):
            predicted_action = agent.eval_step({
                'obs':
                np.random.random_sample((2, )),
                'legal_actions': [0, 1]
            })

        sess.close()
        tf.compat.v1.reset_default_graph()
Example #9
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[512, 512],
                          anticipatory_param=0.1,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          q_norm_step=norm_step,
                          q_mlp_layers=[512, 512])
        agents.append(agent)

    sess.run(tf.global_variables_initializer())

    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Count the number of steps
    step_counters = [0 for _ in range(env.player_num)]
Example #10
def train_mahjong():

    # Make environment
    env = rlcard.make('mahjong', config={'seed': 0})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 1000
    episode_num = 10000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_nfsp_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[512, 512],
                              anticipatory_param=0.5,
                              batch_size=256,
                              rl_learning_rate=0.00005,
                              sl_learning_rate=0.00001,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=int(1e5),
                              q_replay_memory_init_size=memory_init_size,
                              train_every=train_every,
                              q_train_every=train_every,
                              q_batch_size=256,
                              q_mlp_layers=[512, 512])
            agents.append(agent)
        random_agent = RandomAgent(action_num=eval_env.action_num)

        env.set_agents(agents)
        eval_env.set_agents(
            [agents[0], random_agent, random_agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve

        logger = Logger(log_dir)

        for episode in tqdm(range(episode_num)):

            # First sample a policy for the episode
            for agent in agents:
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for i in range(env.player_num):
                for ts in trajectories[i]:
                    agents[i].feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('NFSP')

        # Save model
        save_dir = 'models/mahjong_nfsp'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
Example #11
# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          q_train_every=train_every,
                          q_mlp_layers=[128, 128])
        agents.append(agent)
    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)
Example #12
# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[512, 1024, 2048, 1024, 512],
                          anticipatory_param=0.5,
                          batch_size=256,
                          rl_learning_rate=0.00005,
                          sl_learning_rate=0.00001,
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_size=int(1e5),
                          q_replay_memory_init_size=memory_init_size,
                          q_norm_step=norm_step,
                          q_batch_size=256,
                          q_mlp_layers=[512, 1024, 2048, 1024, 512])
        agents.append(agent)

    sess.run(tf.global_variables_initializer())

    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
    eval_env.set_agents([agents[0], random_agent, random_agent])
Example #13
# Make environment
env = rlcard.make('leduc-holdem')

# Set a global seed
set_global_seed(0)

# Load pretrained model
graph = tf.Graph()
sess = tf.Session(graph=graph)

with graph.as_default():
    nfsp_agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          q_mlp_layers=[128, 128])
        nfsp_agents.append(agent)

# A pretrained model ships with rlcard; change the path below to load your own model.
check_point_path = os.path.join(rlcard.__path__[0],
                                'models/pretrained/leduc_holdem_nfsp')

with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
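
# Hedged continuation sketch (not part of the original example): run the
# evaluation announced above by pairing the loaded NFSP agent with a random
# agent, as in the other snippets. RandomAgent and tournament are assumed to be
# imported as in those snippets.
random_agent = RandomAgent(action_num=env.action_num)
env.set_agents([nfsp_agents[0], random_agent])
print('Average reward against a random agent:',
      tournament(env, evaluate_num)[0])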
Example #14
# Set a global seed
set_global_seed(0)

with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          anticipatory_param=0.1,
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          rl_learning_rate=0.1,
                          sl_learning_rate=0.005,
                          hidden_layers_sizes=[128, 128],
                          min_buffer_size_to_learn=memory_init_size,
                          q_replay_memory_init_size=memory_init_size,
                          q_epsilon_start=0.06,
                          q_epsilon_end=0.0,
                          q_norm_step=norm_step,
                          q_mlp_layers=[128, 128])
        agents.append(agent)

    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver()

    random_agent = RandomAgent(action_num=eval_env.action_num)

    env.set_agents(agents)
Example #15
### Step 2: Initialize the NFSP agents. ###
import tensorflow.compat.v1 as tf
from rlcard.agents.nfsp_agent import NFSPAgent
tf.disable_v2_behavior()
memory_init_size = 1000
norm_step = 100
with tf.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(sess,
                          scope='nfsp' + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          hidden_layers_sizes=[128, 128],
                          min_buffer_size_to_learn=1000,
                          q_replay_memory_init_size=memory_init_size,
                          q_update_target_estimator_every=norm_step,
                          q_mlp_layers=[128, 128])
        agents.append(agent)
    # with sess.as_default():  #uncomment when loading
    #     saver = tf.train.Saver()
    #     saver.restore(sess, tf.train.latest_checkpoint(save_dir))

    sess.run(tf.global_variables_initializer())  # comment out when loading
    env.set_agents(agents)  # Set up all NFSP agents in the training environment

    # Set up a random agent for evaluation
    random_agent = RandomAgent(action_num=eval_env.action_num)
    eval_env.set_agents([agents[0], random_agent])
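
    # Hedged Step 3 sketch (not part of the original snippet), mirroring the
    # training loop of Example #10; episode_num, evaluate_every, evaluate_num
    # and logger are assumed to be defined as in the other examples.
    for episode in range(episode_num):
        # Sample a policy (best response vs. average policy) for this episode
        for agent in agents:
            agent.sample_episode_policy()

        # Generate data from the environment and feed it into each agent's memory
        trajectories, _ = env.run(is_training=True)
        for i in range(env.player_num):
            for ts in trajectories[i]:
                agents[i].feed(ts)

        # Periodically evaluate against the random agent
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])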