Example 1
def train(args):
    # Make environments; CFR only supports Leduc Hold'em
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'allow_step_back':True})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0})

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Initialize CFR Agent
    agent = CFRAgent(env, os.path.join(args.log_dir, 'cfr_model'))
    agent.load()  # If a saved model exists, load it first

    # Evaluate CFR against random
    eval_env.set_agents([agent, RandomAgent(num_actions=env.num_actions)])

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):
            agent.train()
            print('\rIteration {}'.format(episode), end='')
            # Evaluate the performance. Play with Random agents.
            if episode % args.evaluate_every == 0:
                agent.save() # Save model
                logger.log_performance(env.timestep, tournament(eval_env, args.num_eval_games)[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path
    # Plot the learning curve
    plot_curve(csv_path, fig_path, 'cfr')
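
The example above only defines train(args). The driver below is a sketch, not part of the original snippet; it assumes nothing beyond the argparse attributes that train() reads (seed, num_episodes, evaluate_every, num_eval_games, log_dir), and the default values are illustrative.

# Hypothetical driver for the train() function above; defaults are illustrative.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser('CFR example in RLCard')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--log_dir', type=str,
                        default='experiments/leduc_holdem_cfr_result/')

    args = parser.parse_args()
    train(args)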
Example 2
 def test_vec_env(self):
     env = rlcard.make('limit-holdem', config={'env_num': 4})
     env.set_agents(
         [RandomAgent(env.action_num) for _ in range(env.player_num)])
     trajectories, payoffs = env.run(is_training=False)
     self.assertEqual(len(payoffs), 4)
     trajectories, payoffs = env.run(is_training=True)
Example 3
def load_model(model_path, env=None, position=None, device=None):
    if os.path.isfile(model_path):  # Torch model
        import torch
        agent = torch.load(model_path, map_location=device)
        agent.set_device(device)
    elif os.path.isdir(model_path):  # CFR model
        from rlcard.agents import CFRAgent
        agent = CFRAgent(env, model_path)
        agent.load()
    elif model_path == 'random':  # Random model
        from rlcard.agents import RandomAgent
        agent = RandomAgent(num_actions=env.num_actions)
    else:  # A model in the model zoo
        from rlcard import models
        agent = models.load(model_path).agents[position]

    return agent
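
A usage sketch for load_model() follows; it is an assumption rather than part of the original code, the model path is a placeholder, and it relies only on get_device and tournament from rlcard.utils.

# Hypothetical usage of load_model(); the model path below is a placeholder.
import rlcard
from rlcard.utils import get_device, tournament

device = get_device()
env = rlcard.make('leduc-holdem', config={'seed': 0})
env.set_agents([
    load_model('experiments/leduc_holdem_dqn_result/model.pth', env, 0, device),
    load_model('random', env, 1, device),
])
print(tournament(env, 1000))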
Example 4
def train_uno():
    # Make environment; allow_step_back is required by CFR
    env = rlcard.make('uno', config={'seed': 0, 'allow_step_back':True})
    eval_env = rlcard.make('uno', config={'seed': 0})

    # Set the number of iterations and how frequently we evaluate performance and save the model
    evaluate_every = 100
    save_plot_every = 1000
    evaluate_num = 10000
    episode_num = 10000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/uno_cfr_result/'

    # Set a global seed
    set_global_seed(0)

    model_path = 'models/uno_cfr'
    # Initialize CFR Agent
    agent = CFRAgent(env, model_path=model_path)
    agent.load()  # If a saved model exists, load it first

    # Evaluate CFR against a random agent
    random_agent = RandomAgent(action_num=eval_env.action_num)
    eval_env.set_agents([agent, random_agent])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
        agent.train()
        print('\rIteration {}'.format(episode), end='')
        # Evaluate the performance. Play against the random agent.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('CFR')
Example 5
def run(args):
    # Make environment
    env = rlcard.make(args.env, config={'seed': 42})

    # Seed numpy, torch, random
    set_seed(42)

    # Set agents
    agent = RandomAgent(num_actions=env.num_actions)
    env.set_agents([agent for _ in range(env.num_players)])

    # Generate data from the environment
    trajectories, player_wins = env.run(is_training=False)
    # Print out the trajectories
    print('\nTrajectories:')
    print(trajectories)
    print('\nSample raw observation:')
    pprint.pprint(trajectories[0][0]['raw_obs'])
    print('\nSample raw legal_actions:')
    pprint.pprint(trajectories[0][0]['raw_legal_actions'])
Example 6
def run(path: str, num: int, position: int, opponent: str):
    # Set a global seed
    set_global_seed(123)

    env = make('thousand-schnapsen',
               config={
                   'seed': 0,
                   'force_zero_sum': True
               })
    agents = []
    for _ in range(env.player_num):
        agent = RandomAgent(action_num=env.action_num)
        agents.append(agent)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    with graph.as_default():
        agent = DeepCFR(sess,
                        scope=f'deep_cfr{position}',
                        env=env,
                        policy_network_layers=(8 * 24, 4 * 24, 2 * 24, 24),
                        advantage_network_layers=(8 * 24, 4 * 24, 2 * 24, 24))
        if opponent == 'deep_cfr':
            agents[0] = agent
            agents[1] = agent
            agents[2] = agent
        else:
            agents[position] = agent

    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            saver.restore(sess, tf.train.latest_checkpoint(path))

    env.set_agents(agents)
    _, wins = tournament(env, num)
    print(wins)
Example 7
def main():
    # Make environment
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})
    iterations = 1

    # Set a global seed
    set_global_seed(0)

    # Set up agents
    agent = RandomAgent(action_num=env.action_num)
    env.set_agents([agent, agent])

    for it in range(iterations):

        # Generate data from the environment
        trajectories, payoffs = env.run(is_training=False)

        # Print out the trajectories
        print('\nIteration {}'.format(it))
        for ts in trajectories[0]:
            print(
                'State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.
                format(ts[0], ts[1], ts[2], ts[3], ts[4]))
Example 8
 def __init__(self):
     ''' Load random model
     '''
     env = rlcard.make('doudizhu')
     self.agent = RandomAgent(action_num=env.action_num)
     self.player_num = env.player_num
Example 9
''' A toy example of playing against a random agent on Limit Hold'em
'''

import rlcard
from rlcard.agents import LimitholdemHumanAgent as HumanAgent
from rlcard.agents import RandomAgent
from rlcard.utils.utils import print_card

# Make environment
env = rlcard.make('limit-holdem')
human_agent = HumanAgent(env.num_actions)
agent_0 = RandomAgent(num_actions=env.num_actions)
env.set_agents([human_agent, agent_0])

print(">> Limit Hold'em random agent")

while (True):
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print the other players' actions
    if len(trajectories[0]) != 0:
        final_state = trajectories[0][-1]
        action_record = final_state['action_record']
        state = final_state['raw_obs']
        _action_list = []
        for i in range(1, len(action_record) + 1):
            """
            if action_record[-i][0] == state['current_player']:
                break
Example 10
def train_uno():
    # Make environment
    env = rlcard.make("uno", config={"seed": 0})
    eval_env = rlcard.make("uno", config={"seed": 0})

    # Set the iterations numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 3000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 100

    # The paths for saving the logs and learning curves
    log_dir = "./experiments/uno_results_dqn/"

    # Set a global seed
    set_global_seed(0)

    params = {
        "scope": "DQN-Agent",
        "num_actions": env.action_num,
        "replay_memory_size": memory_init_size,
        "num_states": env.state_shape,
        "discount_factor": 0.99,
        "epsilon_start": 1.0,
        "epsilon_end": 0.1,
        "epsilon_decay_steps": 20000,
        "batch_size": 32,
        "train_every": 1,
        "mlp_layers": [512, 512],
        "lr": 0.0005,
    }

    agent_conf = DQN_conf(**params)
    agent = DQN_agent(agent_conf)

    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot("DQN UNO")

    # Save model
    save_dir = "models/uno_dqn_pytorch"
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    state_dict = agent.get_state_dict()
    print(state_dict.keys())
    torch.save(state_dict, os.path.join(save_dir, "model.pth"))
Example 11
def getAgent(agent_type, env):

    agent = None

    if agent_type == 'RandomAgent':
        agent = RandomAgent(action_num=env.action_num)

    elif agent_type == 'DDQNAgent':
        agent = DDQNAgent(
            action_num=env.action_num,
            state_shape=env.state_shape,
        )

    elif agent_type == 'A2CLSTMAgent':
        agent = A2CLSTMAgent(
            action_num=env.action_num,
            state_shape=env.state_shape,
            trainble=False,
            discount_factor=0.95,
            critic_lstm_layers=[1, 512],
            critic_mlp_layers=[3, 512],
            critic_activation_func='tanh',
            critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001,
            critic_bacth_size=128,
            actor_lstm_layers=[1, 512],
            actor_mlp_layers=[3, 512],
            actor_activation_func='tanh',
            actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001,
            actor_bacth_size=512,
            entropy_coef=0.5,
            max_grad_norm=1,
        )

    elif agent_type == 'A2CQPGAgent':
        agent = A2CQPGAgent(
            action_num=env.action_num,
            state_shape=env.state_shape,
            trainble=False,
            discount_factor=0.95,
            critic_mlp_layers=[4, 512],
            critic_activation_func='tanh',
            critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001,
            critic_bacth_size=128,
            actor_mlp_layers=[4, 512],
            actor_activation_func='tanh',
            actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001,
            actor_bacth_size=512,
            entropy_coef=1,
            max_grad_norm=1,
        )

    elif agent_type == 'A2CLSTMQPGAgent':
        agent = A2CLSTMQPGAgent(
            action_num=env.action_num,
            state_shape=env.state_shape,
            trainable=False,
            discount_factor=0.95,
            critic_lstm_layers=[1, 512],
            critic_mlp_layers=[3, 512],
            critic_activation_func='tanh',
            critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001,
            critic_bacth_size=128,
            actor_lstm_layers=[1, 512],
            actor_mlp_layers=[3, 512],
            actor_activation_func='tanh',
            actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001,
            actor_bacth_size=512,
            entropy_coef=0.5,
            max_grad_norm=1,
        )

    elif agent_type == 'A2CAgent':
        agent = A2CAgent(
            action_num=env.action_num,
            state_shape=env.state_shape,
            discount_factor=0.95,
            critic_mlp_layers=[4, 512],
            critic_activation_func='tanh',
            critic_kernel_initializer='glorot_uniform',
            critic_learning_rate=0.001,
            critic_bacth_size=128,
            actor_mlp_layers=[4, 512],
            actor_activation_func='tanh',
            actor_kernel_initializer='glorot_uniform',
            actor_learning_rate=0.0001,
            actor_bacth_size=512,
            entropy_coef=1,
            max_grad_norm=1,
        )
    else:
        raise ValueError(str(agent_type) + ' type does not exist')

    return agent
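
A minimal usage sketch for getAgent(); the game and agent type below are placeholders, and it assumes the older env.action_num / env.player_num API used throughout this example.

# Hypothetical usage of getAgent(); 'uno' and 'RandomAgent' are placeholders.
import rlcard

env = rlcard.make('uno', config={'seed': 0})
env.set_agents([getAgent('RandomAgent', env) for _ in range(env.player_num)])
trajectories, payoffs = env.run(is_training=False)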
Example 12
        # Save model
        save_dir = 'models/uno_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()

        # Set up the agents
        agent = DQNAgent(sess,
                        scope='dqn',
                        action_num=env.action_num,
                        replay_memory_size=20000,
                        replay_memory_init_size=memory_init_size,
                        train_every=train_every,
                        state_shape=env.state_shape,
                        mlp_layers=[512,512])
        random_agent1 = RandomAgent(action_num=eval_env.action_num)
        random_agent2 = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, random_agent1, random_agent2])
        eval_env.set_agents([agent, random_agent1, random_agent2])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            print('Episode: ' + str(episode))

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)
Example 13
def train_uno():

    # Make environment
    env = rlcard.make('uno', config={'seed': 0})
    eval_env = rlcard.make('uno', config={'seed': 0})

    # Set the iterations numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 3000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/uno_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_size=20000,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, random_agent])
        eval_env.set_agents([agent, random_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/uno_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
Example 14
def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
Example 15
''' Another example of loading a pre-trained NFSP model on Leduc Hold'em
    Here, we directly load the model from model zoo
'''
import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed, tournament
from rlcard import models

# Make environment
env = rlcard.make('leduc-holdem', config={'seed': 0})

# Set a global seed
set_global_seed(0)

# Here we directly load NFSP models from /models module
nfsp_agents = models.load('leduc-holdem-nfsp').agents

# Evaluate the performance. Play with random agents.
evaluate_num = 10000
random_agent = RandomAgent(env.action_num)
env.set_agents([nfsp_agents[0], random_agent])
reward = tournament(env, evaluate_num)[0]
print('Average reward against random agent: ', reward)

Example 16
from rlcard.agents import RandomAgent
import rlcard
import numpy as np
env = rlcard.make('doudizhu')

env.set_agents([
    RandomAgent(env.action_num),
    RandomAgent(env.action_num),
    RandomAgent(env.action_num)
])

# Have the random agents play Dou Dizhu

a = 0  # Track the longest trajectory seen for player 0
for i in range(1000):
    trans, _ = env.run(is_training=False)
    if (len(trans[0]) > a):
        print(len(trans[0]))
        a = len(trans[0])

print('')
Example 17
def main():
    
    parser = createParser()
    namespace = parser.parse_args(sys.argv[1:])
    
    # Random seed
    random_seed = namespace.random_seed
    # Names
    env_name = namespace.env_name
    env_num = 1
    test_name = namespace.test_name
    dir_name = str(env_name)+'_a2c_'+str(test_name)+str(random_seed)
    # Set the iterations numbers and how frequently we evaluate/save plot
    evaluate_every = namespace.evaluate_every
    evaluate_num = namespace.evaluate_num
    episode_num = namespace.episode_num
    # Train the agent every X steps
    train_every = namespace.train_every
    save_every = namespace.save_every
    
    
    # Make environment
    env_rand = rlcard.make(env_name, config={'seed': random_seed})
    eval_env = rlcard.make(env_name, config={'seed': random_seed})
        
    # The paths for saving the logs and learning curves
    log_dir = './experiments/rl/'+dir_name+'_result'
    
    # Save model
    save_dir = 'models/rl/'+dir_name+'_result'
    
    # Set a global seed
    set_global_seed(random_seed)
    
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)
    # Set up the agents
    
    agent_rand = RandomAgent(action_num=eval_env.action_num)    
    
    agent_test = A2CLSTMQPGAgent(
                     action_num=eval_env.action_num,
                     state_shape=eval_env.state_shape,
                     
                     discount_factor=0.95,
                
                     critic_lstm_layers=[1,512],
                     critic_mlp_layers=[3,512],
                     critic_activation_func='tanh', 
                     critic_kernel_initializer='glorot_uniform',
                     critic_learning_rate=0.001,
                     critic_bacth_size=128,
                     
                     actor_lstm_layers=[1,512],
                     actor_mlp_layers=[3,512],
                     actor_activation_func='tanh', 
                     actor_kernel_initializer='glorot_uniform', 
                     actor_learning_rate=0.0001,
                     actor_bacth_size=512,
                     
                     entropy_coef=0.5,
                     entropy_decoy=math.pow(0.1/0.5, 1.0/(episode_num//train_every)),
                     
                     max_grad_norm = 1,)  
    
    if namespace.load_model is not None:
        agent_test.load_model(namespace.load_model)
    
    env_rand.set_agents([agent_test, agent_rand])
    
    eval_env.set_agents([agent_test, agent_rand])

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir+'/'+test_name)
    
    envs = [env_rand, 
            ]
    
    env_num = len(envs)
    for episode in range(episode_num // env_num):

        # Generate data from the environments
        for env in envs:
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent_test.feed(ts)
            
        if episode % (train_every // env_num) == 0:
            agent_test.train()
        
        if episode % (save_every // env_num) == 0 :
            # Save model
            if not os.path.exists(save_dir+'/'+test_name+str(episode*env_num)):
                os.makedirs(save_dir+'/'+test_name+str(episode*env_num))
            agent_test.save_model(save_dir+'/'+test_name+str(episode*env_num))
            
        # Evaluate the performance. Play with random agents.
        if episode % (evaluate_every // env_num) == 0:
            print('episode: ', episode*env_num)
            logger.log_performance(episode*env_num, tournament(eval_env, evaluate_num)[0])


    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot(dir_name)
         
    # Save model
    if not os.path.exists(save_dir+'/'+test_name+str(episode_num)):
        os.makedirs(save_dir+'/'+test_name+str(episode_num))
    agent_test.save_model(save_dir+'/'+test_name+str(episode_num))
Example 18
def main():
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'seed': 0,
                          'env_num': 16,
                          'game_player_num': 4
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 0,
                               'env_num': 16
                           })

    # Set the iterations numbers and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])

        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        #saver.restore(sess, os.path.join(save_dir, 'model'))

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            agent2.sample_episode_policy()
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            for ts in trajectories[2]:
                agent2.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(sess, os.path.join(save_dir, 'model_final'))
Example 19
import rlcard
from rlcard.agents import RandomAgent as RandomAgent
from rlcard.agents import BlackjackHumanAgent as HumanAgent
from rlcard.utils.utils import print_card

# Make environment
num_players = 2
env = rlcard.make(
    'blackjack',
    config={
        'game_num_players': num_players,
    },
)
human_agent = HumanAgent(env.num_actions)
random_agent = RandomAgent(env.num_actions)
env.set_agents([
    human_agent,
    random_agent,
])

print(">> Blackjack human agent")

while (True):
    print(">> Start a new game")

    trajectories, payoffs = env.run(is_training=False)
    # If the human does not take the final action, we need to
    # print other players action

    if len(trajectories[0]) != 0:
Example 20
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")

    #os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'game_player_num': 2,
                          'seed': 477
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 12,
                               'game_player_num': 2
                           })
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={
                                'seed': 43,
                                'game_player_num': 2
                            })
    #eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})
    # Set the iterations numbers and how frequently we evaluate the performance

    # The initial memory size
    memory_init_size = 1000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/no_all_in'

    # Set a global seed
    set_global_seed(477)

    graph = tf.Graph()
    tf.ConfigProto()
    sess = tf.Session(graph=graph)

    evaluate_every = 2048
    evaluate_num = 32
    episode_num = 24576

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256
    agents = []
    with graph.as_default():
        """
        def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        """

        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=80000,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      q_replay_memory_size=80000,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\iivan')
    print(
        '-------------------------------------------------------------------------------------'
    )
    # print(check_point_path)

    #todays project :)
    # https://stackoverflow.com/questions/33758669/running-multiple-tensorflow-sessions-concurrently
    with sess.as_default():
        with graph.as_default():
            # saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)

            env.set_agents(agents)
            eval_env.set_agents([agents[0], random_agent])
            eval_env2.set_agents([random_agent, agents[1]])
            # eval_env3.set_agents([agents[1], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                print(episode, end='\r')
                #print('oh')

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)
                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\nTournament '
                        + str(episode / evaluate_every))
                    # tournament(eval_env2, 6)
                    # exploitability.exploitability(eval_env, agents[0], 500)

                    res = tournament(env, evaluate_num)
                    logger.log_performance(env.timestep, res[0])
                    res2 = tournament(eval_env, evaluate_num // 3)
                    logger.log_performance(env.timestep, res2[0])
                    res3 = tournament(eval_env2, evaluate_num // 3)
                    logger.log_performance(env.timestep, res3[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) +
                               '\n')
                    logger.log(
                        '\n\n----------------------------------------------------------------'
                    )

                if episode % (evaluate_every) == 0 and not episode == 0:
                    save_dir = 'models/nolimit_holdem_nfsp/no_all_in/cp/' + str(
                        episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/no_all_in'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
Example 21
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")

    #os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'record_action': False,
                          'game_player_num': 2
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 12,
                               'game_player_num': 2
                           })
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={
                                'seed': 43,
                                'game_player_num': 2
                            })

    # Set the iterations numbers and how frequently we evaluate the performance

    # The initial memory size
    memory_init_size = 1000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/1v1MCNFSPv3'

    # Set a global seed
    set_global_seed(0)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    evaluate_every = 1000
    evaluate_num = 250
    episode_num = 5000

    # The initial memory size
    memory_init_size = 1500

    # Train the agent every X steps
    train_every = 256
    agents = []
    with graph.as_default():

        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=.1,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.075,
                      rl_learning_rate=0.075,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every // 2,
                      q_train_every=train_every // 2,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\1v1MCNFSPv3\\cp\\10')
    print(
        '-------------------------------------------------------------------------------------'
    )
    # print(check_point_path)
    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)

            #easy_agent = nfsp_agents[0]
            print(agents)
            # print(nfsp_agents)
            env.set_agents(agents)
            eval_env.set_agents(agents)
            eval_env2.set_agents([agents[0], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()
                table = []
                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts, table)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\nTournament '
                        + str(episode / evaluate_every))
                    res = tournament(eval_env, evaluate_num)
                    res2 = tournament(eval_env2, evaluate_num // 4)
                    logger.log_performance(env.timestep, res[0])
                    logger.log_performance(env.timestep, res2[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) +
                               '\n')
                    logger.log(
                        '\n\n----------------------------------------------------------------'
                    )

                if episode % (evaluate_every) == 0 and not episode == 0:
                    save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good' + str(
                        episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
Example 22
# The initial memory size
memory_init_size = 100

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/blackjack_dqn_result/'

# Set a global seed
set_global_seed(0)



# Set up the agents
agent = RandomAgent(action_num=env.action_num)

env.set_agents([agent])
eval_env.set_agents([agent])

# Initialize global variables


# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
Example 23
from itertools import combinations, combinations_with_replacement
import rlcard
from rlcard.utils import set_global_seed
from rlcard.agents import RandomAgent
from yaniv_rl.models.yaniv_rule_models import YanivNoviceRuleAgent, YanivIntermediateRuleAgent
from yaniv_rl import utils
from rlcard.envs.registration import register, make

register(
    env_id='yaniv',
    entry_point='yaniv_rl.envs.yaniv:YanivEnv',
)

agents = [
    RandomAgent(488),
    YanivNoviceRuleAgent(),
    YanivIntermediateRuleAgent()
]

# Make environment
env = make('yaniv', config={'seed': 0})

eval_num = 10000

table = [[0 for i in range(3)] for i in range(3)]
for i in range(3):
    # player v player
    env.set_agents([agents[i], agents[i]])
    res = utils.tournament(env, eval_num)
    winrate = res['wins'][0] / eval_num
    table[i][i] = winrate
Example 24
''' A toy example of playing Whale with random agents
'''

import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import set_global_seed

# Make environment
env = rlcard.make('whale', config={'seed': 0, 'num_players':4})
episode_num = 5

# Set a global seed
set_global_seed(0)

# Set up agents
agent_0 = RandomAgent(action_num=env.action_num)
agent_1 = RandomAgent(action_num=env.action_num)
agent_2 = RandomAgent(action_num=env.action_num)
agent_3 = RandomAgent(action_num=env.action_num)
env.set_agents([agent_0, agent_1, agent_2, agent_3])

for episode in range(episode_num):

    # Generate data from the environment
    trajectories, _ = env.run(is_training=False)

    # Print out the trajectories
    print('\nEpisode {}'.format(episode))
    for ts in trajectories[0]:
        print('State: {}, Action: {}, Reward: {}, Next State: {}, Done: {}'.
              format(ts[0], ts[1], ts[2], ts[3], ts[4]))
Example 25
from itertools import combinations, combinations_with_replacement
import rlcard
from rlcard.utils import set_global_seed
from rlcard.agents import RandomAgent
from yaniv_rl.models.yaniv_rule_models import YanivNoviceRuleAgent, YanivIntermediateRuleAgent
from yaniv_rl import utils
from rlcard.envs.registration import register, make
import sys 
register(
    env_id='yaniv',
    entry_point='yaniv_rl.envs.yaniv:YanivEnv',
)


agents = [RandomAgent(488), YanivNoviceRuleAgent(), YanivIntermediateRuleAgent()]

# Make environment

eval_num = 10000
env = make('yaniv', config={'seed': 0, 'starting_player': 0})
table = [[0 for i in range(3)] for i in range(3)]
for i in range(3):
    # player v player
    env.set_agents([agents[i], agents[i]])
    res = utils.tournament(env, eval_num)
    print("{} vs {}: ".format(i, i), res)
    winrate = res['wins'][0] / eval_num
    table[i][i] = winrate
for agent_1, agent_2 in combinations(agents, 2):
    a1i = agents.index(agent_1)
    a2i = agents.index(agent_2)
Example 26
# with tf.Session(config=config) as sess:

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent, random_agent])
    eval_env.set_agents([agent, random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
Example 27
def main():
    wandb_config = wandb.config
    config = {}
    hyperparams = {}
    for key in wandb_config.keys():
        if key in default_config:
            config[key] = wandb_config[key]
        elif key in default_hyperparams:
            hyperparams[key] = wandb_config[key]

    # Make environment
    env = make("yaniv", config=config)
    eval_env = make("yaniv", config=config)

    agents = []
    for i in range(env.player_num):
        agent = NFSPAgent(scope="nfsp" + str(i),
                          action_num=env.action_num,
                          state_shape=env.state_shape,
                          device=torch.device("cuda"),
                          **hyperparams)
        agents.append(agent)
        if load_model is not None:
            state_dict = torch.load(load_model)
            policy_dict = state_dict[load_scope]
            agent.policy_network.load_state_dict(policy_dict)
            q_key = load_scope + "_dqn_q_estimator"
            agent._rl_agent.q_estimator.qnet.load_state_dict(state_dict[q_key])
            target_key = load_scope + "_dqn_target_estimator"
            agent._rl_agent.target_estimator.qnet.load_state_dict(
                state_dict[target_key])

    rule_agent = YanivNoviceRuleAgent(
        single_step=config["single_step_actions"])
    random_agent = RandomAgent(action_num=env.action_num)

    def agent_feed(agent, trajectories):
        for transition in trajectories:
            agent.feed(transition)

    def save_function(agent, model_dir):
        torch.save(agent.get_state_dict(),
                   os.path.join(model_dir, "model_{}.pth".format(i)))

    e = ExperimentRunner(
        env,
        eval_env,
        log_every=100,
        save_every=100,
        base_dir="yaniv_nfsp_pytorch",
        config=config,
        training_agent=agents[0],
        vs_agent=agents[1],
        feed_function=agent_feed,
        save_function=save_function,
    )

    e.run_training(
        episode_num=50000,
        eval_every=200,
        eval_vs=[random_agent, rule_agent],
        eval_num=100,
    )