Example #1
def load_nfsp_leduc_agent(model_path):
    # Set a global seed
    set_global_seed(0)

    # Load pretrained model
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with graph.as_default():
        nfsp_agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[128, 128],
                              q_mlp_layers=[128, 128])
            nfsp_agents.append(agent)

    # We have a pretrained model here. Change the path for your model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

    return nfsp_agents[0]
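A hypothetical usage sketch for the loader above (the environment name and pretrained path are assumptions, not part of the original snippet; the function also relies on a module-level env, since it reads env.player_num, env.action_num and env.state_shape):

# Hypothetical usage: env must already exist at module level before calling the loader
env = rlcard.make('leduc-holdem')
eval_env = rlcard.make('leduc-holdem')

nfsp_agent = load_nfsp_leduc_agent('models/pretrained/leduc_holdem_nfsp')
eval_env.set_agents([nfsp_agent, RandomAgent(action_num=eval_env.action_num)])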
Example #2
def load_dqn_leduc_agent(model_path):
    # Set a global seed
    set_global_seed(0)

    # Load pretrained model
    # tf.reset_default_graph()
    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with graph.as_default():
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])

    # We have a pretrained model here. Change the path for your model.
    # check_point_path = os.path.join(rlcard.__path__[0], 'models/pretrained/leduc_holdem_nfsp')
    check_point_path = model_path

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
    return agent
Example #3
def multi_traverse(self, q, player_id, num):
    regretMemory = []
    policyMemory = []
    set_global_seed(os.getpid())
    for i in range(num):
        self.env.init_game()
        probs = np.ones(self.env.player_num)
        self.traverse_tree(max(self.iteration-self.startPolicy, 0), 1, player_id, regretMemory, policyMemory)
    q.put([regretMemory, policyMemory])
    return [regretMemory, policyMemory]
Example #4
def train():
    env = rlcard.make('mahjong', {'allow_step_back': True})
    # env = rlcard.make('mahjong')

    # Set the number of iterations and how frequently we evaluate/save the plot
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 100000

    # The paths for saving the logs and learning curves
    root_path = './experiments/mahjong_cfr_result/'
    log_path = root_path + 'log.txt'
    csv_path = root_path + 'performance.csv'
    figure_path = root_path + 'figures/'

    # Set a global seed
    set_global_seed(0)

    # Initialize the MCCFR agent
    agent = MCCFRAgent(env)
    # Init a Logger to plot the learning curve
    logger = Logger(root_path)

    for episode in range(episode_num + 1):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        if episode % 5000 == 0:
            agent.save(episode)
        # # Evaluate the performance. Play with NFSP agents.
        # if episode % evaluate_every == 0:
        #     reward = 0
        #     for eval_episode in range(evaluate_num):
        #         _, payoffs = eval_env.run(is_training=False)
        #
        #         reward += payoffs[0]
        #
        #     logger.log('\n########## Evaluation ##########')
        #     logger.log('Iteration: {} Average reward is {}'.format(episode, float(reward)/evaluate_num))
        #
        #     # Add point to logger
        #     logger.add_point(x=env.timestep, y=float(reward)/evaluate_num)
        #
        # # Make plot
        # if episode % save_plot_every == 0 and episode > 0:
        #     logger.make_plot(save_path=figure_path+str(episode)+'.png')

    # Make the final plot
    logger.make_plot(save_path=figure_path + 'final_' + str(episode) + '.png')
Example #5
def traverse(self, agent1, agent2, evaluate_num, eval_env):
    reward = []
    set_global_seed(random.randint(0, 100))
    for eval_episode in range(evaluate_num):
        # Clear any cached opponent values the agents may hold between runs
        try:
            agent1.oppoCV = None
        except AttributeError:
            pass
        try:
            agent2.oppoCV = None
        except AttributeError:
            pass
        his, payoffs = eval_env.run(is_training=False)
        reward.append(payoffs[0])
    #print(reward)
    return np.mean(reward)
Example #6
def main():

    # Make environment
    env = rlcard.make('blackjack')
    episode_num = 2

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent_0 = RandomAgent(action_num=env.action_num)
    env.set_agents([agent_0])

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=False)

        # Print out the trajectories
        print('\nEpisode {}'.format(episode))
        for ts in trajectories[0]:
            print(
                'State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.
                format(ts[0], ts[1], ts[2], ts[3], ts[4]))
Example #7
root_path = './experiments/tarot_dqn_result_v{}/'.format(str(record_number))
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'

# Model save path
# os.makedirs creates the intermediate 'rlcard/models' and 'pretrained'
# directories as needed
if not os.path.exists('rlcard/models/pretrained/tarot_v' + str(record_number)):
    os.makedirs('rlcard/models/pretrained/tarot_v' + str(record_number))
model_path = 'rlcard/models/pretrained/tarot_v' + str(record_number) + '/model'

# Set a global seed
set_global_seed(0)

with tf.compat.v1.Session() as sess:
    # Set agents
    global_step = tf.Variable(0, name='global_step', trainable=False)
    agent = models[str(against_model)](sess.graph, sess).dqn_agent

    opponent_agent = agent

    sess.run(tf.compat.v1.global_variables_initializer())

    saver = tf.compat.v1.train.Saver()

    env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
    eval_env.set_agents([agent] + [opponent_agent] * (env.player_num - 1))
Example #8
def test_set_global_seed(self):
    set_global_seed(0)
    self.assertEqual(np.random.get_state()[1][0], 0)
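The test only asserts on NumPy's state, but a helper like set_global_seed typically seeds every source of randomness at once. A minimal sketch of such a helper, assuming it covers Python's random, NumPy and TensorFlow 1.x (the exact body in rlcard may differ):

import random
import numpy as np
import tensorflow as tf

def set_global_seed_sketch(seed):
    # Seed all common sources of randomness so runs are reproducible;
    # seeding NumPy is what the test above checks for.
    random.seed(seed)
    np.random.seed(seed)
    tf.set_random_seed(seed)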
Example #9
def train_mahjong():

    # Make environment
    env = rlcard.make('mahjong', config={'seed': 0})
    eval_env = rlcard.make('mahjong', config={'seed': 0})

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 1000
    evaluate_num = 1000
    episode_num = 10000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 64

    # The paths for saving the logs and learning curves
    log_dir = './experiments/mahjong_nfsp_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agents = []
        for i in range(env.player_num):
            agent = NFSPAgent(sess,
                              scope='nfsp' + str(i),
                              action_num=env.action_num,
                              state_shape=env.state_shape,
                              hidden_layers_sizes=[512, 512],
                              anticipatory_param=0.5,
                              batch_size=256,
                              rl_learning_rate=0.00005,
                              sl_learning_rate=0.00001,
                              min_buffer_size_to_learn=memory_init_size,
                              q_replay_memory_size=int(1e5),
                              q_replay_memory_init_size=memory_init_size,
                              train_every=train_every,
                              q_train_every=train_every,
                              q_batch_size=256,
                              q_mlp_layers=[512, 512])
            agents.append(agent)
        random_agent = RandomAgent(action_num=eval_env.action_num)

        env.set_agents(agents)
        eval_env.set_agents(
            [agents[0], random_agent, random_agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve

        logger = Logger(log_dir)

        for episode in tqdm(range(episode_num)):

            # First sample a policy for the episode
            for agent in agents:
                agent.sample_episode_policy()

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for i in range(env.player_num):
                for ts in trajectories[i]:
                    agents[i].feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('NFSP')

        # Save model
        save_dir = 'models/mahjong_nfsp'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
Example #10
    # Set the number of worker processes
    process_num = 8

    # Set episode_num
    episode_num = 10000

    # Assign tasks
    per_tasks = assign_task(episode_num, process_num)

    # Set game and make environment
    game = 'doudizhu'
    env = rlcard.make(game)

    # Set global seed
    set_global_seed(1)

    # Set up agents
    agent_num = env.player_num
    env.set_agents([RandomAgent(action_num=env.action_num)
                    for _ in range(agent_num)])

    # Set up a shared list to collect trajectories
    manager = multiprocessing.Manager()
    trajectories_set = manager.list()

    # Generate Processes
    processes = []
    for p in range(process_num):
        process = multiprocessing.Process(target=env.run_multi, args=(per_tasks[p], trajectories_set))
        processes.append(process)
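The excerpt ends after building the process list; a minimal continuation, assuming the standard multiprocessing start/join pattern (not part of the original snippet):

    # Start every worker, then wait for all of them to finish
    for process in processes:
        process.start()
    for process in processes:
        process.join()

    # trajectories_set now holds the episodes collected by all workers
    print('Collected {} trajectories'.format(len(trajectories_set)))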
Example #11
# Set the number of iterations and how frequently we evaluate/save the plot
evaluate_every = 1  #00
save_plot_every = 5  #00
evaluate_num = 5  #0
episode_num = 1  #000

# The paths for saving the logs and learning curves
root_path = './experiments/nolimit_holdem_cfr_result/'
log_path = root_path + 'log.txt'
csv_path = root_path + 'performance.csv'
figure_path = root_path + 'figures/'
log_reward_path = root_path + '_reward_log.txt'
csv_reward_path = root_path + '_reward_performance.csv'
# Set a global seed
set_global_seed(10)

# Initialize the CFR agent
agent = cfr_agent.CFRAgent(env)
#agent.load()  # If we have a saved model, load it first

# Evaluate CFR against pre-trained NFSP
#eval_env.set_agents([agent, models.load('leduc-holdem-nfsp').agents[0]])
eval_env.set_agents([agent, RandomAgent(action_num=env.action_num)])
# Init a Logger to plot the learning curve
logger = Logger(xlabel='iteration',
                ylabel='exploitability',
                legend='CFR on nolimit Holdem',
                log_path=log_path,
                csv_path=csv_path)
logger_reward = Logger(xlabel='iteration',
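The listing is cut off in the middle of the second Logger call. By analogy with the first logger and the reward paths defined above, it might be completed roughly as follows (an assumption, not the original code):

logger_reward = Logger(xlabel='iteration',
                       ylabel='reward',
                       legend='CFR on nolimit Holdem',
                       log_path=log_reward_path,
                       csv_path=csv_reward_path)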
Example #12
    def traverse(self, evaluate_num):
        reward = []
        set_global_seed(random.randint(0,100))
        reward.append(self.agent.compute_exploitability(evaluate_num))

        return np.mean(reward)
Example #13
import rlcard
import torch
from rlcard.agents.reinforce_agent import ReinforceAgent
from rlcard.utils.utils import set_global_seed, tournament
from rlcard.utils.logger import Logger

episode_num = 100000
evaluate_num = 10000
evaluate_every = 1000

env = rlcard.make("blackjack")
eval_env = rlcard.make("blackjack")

log_dir = './experiments/blackjack_reinforce_result/'

set_global_seed(42)

agent = ReinforceAgent(scope="reinforce_agent",
                       action_num=env.action_num,
                       state_shape=env.state_shape,
                       discount_factor=0.99,
                       learning_rate=1e-6,
                       device=None)
env.set_agents([agent])
eval_env.set_agents([agent])

logger = Logger(log_dir)

for episode in range(episode_num):
    trajectories, _ = env.run(is_training=True)
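The listing stops right after generating an episode. A hedged sketch of how the loop might continue, assuming ReinforceAgent consumes transitions through feed() like the other rlcard agents shown above, and that Logger and tournament are used as in Example #9:

    # Assumption: ReinforceAgent exposes feed(ts); the real class may differ
    for ts in trajectories[0]:
        agent.feed(ts)

    # Periodically evaluate the current policy against the evaluation env
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep,
                               tournament(eval_env, evaluate_num)[0])

# Close the log files and plot the learning curve
logger.close_files()
logger.plot('REINFORCE')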