Example #1
    for agent in agents:
        agent.sample_episode_policy()

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)

    # Feed transitions into agent memory, and train the agent
    for i in range(env.player_num):
        # update ray rl model
        for ts in trajectories[i]:
            agents[i].feed(ts)

    # Evaluate the performance. Play with random agents.
    if episode % evaluate_every == 0:
        logger.log(
            '\n\n\n---------------------------------------------------------------\nTournament '
            + str(episode / evaluate_every))
        # tournament(eval_env2, 6)
        # exploitability.exploitability(eval_env, agents[0], 500)

        res = tournament(env, evaluate_num)
        logger.log_performance(env.timestep, res[0])
        res2 = tournament(eval_env, evaluate_num // 3)
        logger.log_performance(env.timestep, res2[0])
        res3 = tournament(eval_env2, evaluate_num // 3)
        logger.log_performance(env.timestep, res3[0])
        logger.log('' + str(episode_num) + " - " + str(episode) + '\n')
        logger.log(
            '\n\n----------------------------------------------------------------'
        )
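This fragment starts in the middle of a training loop, so the setup it relies on is not shown here (a fuller version of the same loop appears in Example #6 below). For orientation, the following is a minimal, self-contained sketch of the sample-policy / run / feed / evaluate cycle, assuming the legacy rlcard 0.x API used throughout these examples; the UniformAgent stub and its no-op feed()/sample_episode_policy() hooks are illustrative placeholders for the NFSP agents, not part of the original code.

import numpy as np
import rlcard
from rlcard.utils import set_global_seed, tournament, Logger

class UniformAgent(object):
    ''' Illustrative stand-in exposing the hooks the loop above expects. '''
    use_raw = False

    def __init__(self, action_num):
        self.action_num = action_num

    def sample_episode_policy(self):
        pass  # NFSP would sample best-response vs. average policy here

    def step(self, state):
        return np.random.choice(state['legal_actions'])

    def eval_step(self, state):
        return self.step(state), []

    def feed(self, ts):
        pass  # a learning agent would store the transition and train here

set_global_seed(0)
env = rlcard.make('no-limit-holdem', config={'seed': 0, 'game_player_num': 2})
eval_env = rlcard.make('no-limit-holdem', config={'seed': 12, 'game_player_num': 2})

agents = [UniformAgent(env.action_num) for _ in range(env.player_num)]
env.set_agents(agents)
eval_env.set_agents(agents)

logger = Logger('./experiments/loop_sketch/')
episode_num, evaluate_every, evaluate_num = 100, 20, 10

for episode in range(episode_num):
    for agent in agents:
        agent.sample_episode_policy()            # sample a policy for the episode
    trajectories, _ = env.run(is_training=True)  # generate data from the environment
    for i in range(env.player_num):
        for ts in trajectories[i]:
            agents[i].feed(ts)                   # feed transitions into agent memory
    if episode % evaluate_every == 0:
        logger.log_performance(env.timestep, tournament(eval_env, evaluate_num)[0])

logger.close_files()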
Example #2
        # Feed transitions into agent memory, and train the agent
        for i in range(env.player_num):
            for ts in trajectories[i]:
                agents[i].feed(ts)

        # extra logging
        if episode % evaluate_every == 0:
            reward = 0
            reward2 = 0
            for eval_episode in range(evaluate_num):
                _, payoffs = eval_env.run(is_training=False)
                reward += payoffs[0]
                reward2 += payoffs[1]

            logger.log(
                "\n\n########## Evaluation {} ##########".format(episode))
            reward_text = "{}".format(float(reward) / evaluate_num)
            reward2_text = "{}".format(float(reward2) / evaluate_num)
            info = "Timestep: {} Average reward is {}, reward2 is {}".format(
                env.timestep, reward_text, reward2_text)
            logger.log(info)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
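The "extra logging" block in the example above hand-rolls an average-payoff evaluation alongside the tournament() call. The same pattern can be captured in a small helper; the sketch below is a hypothetical evaluate_payoffs function (not part of rlcard) and only assumes that env.run(is_training=False) returns (trajectories, payoffs) as it does in these examples.

def evaluate_payoffs(eval_env, num_episodes):
    ''' Average each seat's payoff over num_episodes evaluation games.
        Returns one average payoff per player. '''
    totals = [0.0] * eval_env.player_num
    for _ in range(num_episodes):
        _, payoffs = eval_env.run(is_training=False)  # greedy play, no learning
        for seat, payoff in enumerate(payoffs):
            totals[seat] += payoff
    return [total / num_episodes for total in totals]

# Usage mirroring the block above:
# rewards = evaluate_payoffs(eval_env, evaluate_num)
# logger.log("Timestep: {} Average reward is {}, reward2 is {}".format(
#     env.timestep, rewards[0], rewards[1]))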
from rlcard.utils import Logger
from eval_util import *

# Set the iteration numbers and how frequently we evaluate/save the plot
evaluate_num = 100
emu_num = 50

log_dir = './experiments/doudizhu_mcts_vs_drqn_result/'
best_model_path = './models/doudizhu_train_drqn_as_L_vs_random_and_eval_vs_random_best.npy'

# Set a global seed

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

logger.log("MCTS-UCT VS DRQN")

env = rlcard.make('doudizhu', config={'seed': 0, 'allow_step_back': True})

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
sess = tf.Session(config=config)

drqn_agent = DRQNAgent(sess,
                       scope='doudizhu_drqn',
                       action_num=env.action_num,
                       memory_init_size=3000,
                       memory_size=6000,
                       train_every_t=1,
                       state_shape=env.state_shape,
        # Feed transitions into agent memory, and train the agent

        for tss in trajectories[:1]:
            for ts in tss:
                agent.feed(ts)



        #print(episode)
        if episode % evaluate_every == 0:
            eval_env = rlcard.make('doudizhu', config={'seed': 0, 'allow_step_back': True})
            eval_env.set_agents([agent, SRandomAgent(eval_env.action_num, seed=0), SRandomAgent(eval_env.action_num, seed=0)])
            time_start = time.time()
            payoffs1 = general_tournament(eval_env, evaluate_num, False)
            logger.log("episode:{} time:{} landlord winrate:{}".format(episode, time.time() - time_start, payoffs1[0]))
            L_WR_logger.log_performance(episode, payoffs1[0])

            eval_env = rlcard.make('doudizhu', config={'seed': 0, 'allow_step_back': True})
            eval_env.set_agents([SRandomAgent(eval_env.action_num, seed=0), SRandomAgent(eval_env.action_num, seed=0),agent])
            time_start = time.time()
            payoffs2 = general_tournament(eval_env, evaluate_num, False)
            logger.log("episode:{} time:{} peasant winrate:{}".format(episode, time.time() - time_start, payoffs2[1]))
            P_WR_logger.log_performance(episode, payoffs2[1])

            #

            save_flag = False
            if payoffs1[0] > max_L_WR:
                max_L_WR = payoffs1[0]
                save_flag = True
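The fragment breaks off inside its keep-the-best-model bookkeeping. The sketch below shows one way such tracking is typically completed; BestTracker is an illustrative helper, and the commented np.save() line is only a guess based on best_model_path pointing at a .npy file, not the original code.

class BestTracker(object):
    ''' Remember the best win rate seen so far and signal when to checkpoint. '''
    def __init__(self):
        self.best = float('-inf')

    def update(self, winrate):
        if winrate > self.best:
            self.best = winrate
            return True   # caller should save the model now
        return False

landlord_best = BestTracker()
peasant_best = BestTracker()

# Inside the evaluation block above, the pattern would look roughly like:
# if landlord_best.update(payoffs1[0]) or peasant_best.update(payoffs2[1]):
#     np.save(best_model_path, ...)  # however the DRQN weights are actually persisted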
Example #5
from SeedRanomAgent import SRandomAgent
from rlcard.utils import Logger
from eval_util import *

# Set the iteration numbers and how frequently we evaluate/save the plot
evaluate_num = 1000
emu_num = 50

log_dir = './experiments/doudizhu_random_vs_random_result/'

# Set a global seed

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

logger.log("Random VS Random")

# Landlord
set_global_seed(0)
eval_env = rlcard.make('doudizhu', config={'seed': 0, 'allow_step_back': True})
eval_env.set_agents([
    SRandomAgent(eval_env.action_num, seed=0),
    SRandomAgent(eval_env.action_num, seed=0),
    SRandomAgent(eval_env.action_num, seed=0)
])

time_start = time.time()
logger.log("Random = landlord winrate:{} time:{}".format(
    general_tournament(eval_env, evaluate_num, True)[0],
    time.time() - time_start))
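general_tournament is imported from the author's eval_util module and is not shown in these examples. The sketch below is a plausible reimplementation, assuming it plays the given number of games and returns per-seat win rates (the log lines above index the result by seat), with the third argument treated as a verbose flag; the real helper may differ.

def general_tournament(eval_env, num_episodes, verbose=False):
    ''' Play num_episodes games and return each seat's win rate.
        Assumed behaviour of the helper from eval_util. '''
    wins = [0] * eval_env.player_num
    for episode in range(num_episodes):
        _, payoffs = eval_env.run(is_training=False)
        for seat, payoff in enumerate(payoffs):
            if payoff > 0:   # in Dou Dizhu the winning side gets a positive payoff
                wins[seat] += 1
        if verbose:
            print('episode {}: payoffs {}'.format(episode, payoffs))
    return [win / float(num_episodes) for win in wins]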
Example #6
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")

    #os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'game_player_num': 2,
                          'seed': 477
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 12,
                               'game_player_num': 2
                           })
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={
                                'seed': 43,
                                'game_player_num': 2
                            })
    #eval_env3 = rlcard.make('no-limit-holdem', config={'seed': 43, 'game_player_num': 2})
    # Set the iteration numbers and how frequently we evaluate the performance

    # The initial memory size
    memory_init_size = 1000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/no_all_in'

    # Set a global seed
    set_global_seed(477)

    graph = tf.Graph()
    tf.ConfigProto()
    sess = tf.Session(graph=graph)

    evaluate_every = 2048
    evaluate_num = 32
    episode_num = 24576

    # The initial memory size
    memory_init_size = 256

    # Train the agent every X steps
    train_every = 256
    agents = []
    with graph.as_default():
        """
        def __init__(self,
                 sess,
                 scope,
                 action_num=4,
                 state_shape=None,
                 hidden_layers_sizes=None,
                 reservoir_buffer_capacity=int(1e6),
                 anticipatory_param=0.1,
                 batch_size=256,
                 train_every=1,
                 rl_learning_rate=0.1,
                 sl_learning_rate=0.005,
                 min_buffer_size_to_learn=1000,
                 q_replay_memory_size=30000,
                 q_replay_memory_init_size=1000,
                 q_update_target_estimator_every=1000,
                 q_discount_factor=0.99,
                 q_epsilon_start=0.06,
                 q_epsilon_end=0,
                 q_epsilon_decay_steps=int(1e6),
                 q_batch_size=256,
                 q_train_every=1,
                 q_mlp_layers=None,
                 evaluate_with='average_policy'):
        """

        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_size=80000,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=0.01,
                      sl_learning_rate=0.005,
                      q_epsilon_start=.7,
                      q_replay_memory_size=80000,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every + 44,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\iivan')
    print(
        '-------------------------------------------------------------------------------------'
    )
    # print(check_point_path)

    # today's project :)
    # https://stackoverflow.com/questions/33758669/running-multiple-tensorflow-sessions-concurrently
    with sess.as_default():
        with graph.as_default():
            # saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)

            env.set_agents(agents)
            eval_env.set_agents([agents[0], random_agent])
            eval_env2.set_agents([random_agent, agents[1]])
            # eval_env3.set_agents([agents[1], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):
                print(episode, end='\r')
                #print('oh')

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()

                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)
                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\nTournament '
                        + str(episode / evaluate_every))
                    # tournament(eval_env2, 6)
                    # exploitability.exploitability(eval_env, agents[0], 500)

                    res = tournament(env, evaluate_num)
                    logger.log_performance(env.timestep, res[0])
                    res2 = tournament(eval_env, evaluate_num // 3)
                    logger.log_performance(env.timestep, res2[0])
                    res3 = tournament(eval_env2, evaluate_num // 3)
                    logger.log_performance(env.timestep, res3[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) +
                               '\n')
                    logger.log(
                        '\n\n----------------------------------------------------------------'
                    )

                if episode % evaluate_every == 0 and episode != 0:
                    save_dir = 'models/nolimit_holdem_nfsp/no_all_in/cp/' + str(
                        episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/no_all_in'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
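The commented-out saver.restore() lines earlier in this example hint at how a saved run is reloaded. A minimal sketch of restoring the newest checkpoint written by this script is shown below; it assumes the same graph-building code has already run (so the variables exist) and uses only the standard TF1 tf.train.Saver / tf.train.latest_checkpoint calls.

import tensorflow as tf

def restore_if_available(sess, graph, check_point_dir):
    ''' Reload the newest checkpoint under check_point_dir, or fall back to
        fresh initialization if no checkpoint has been written yet. '''
    with graph.as_default():
        latest = tf.train.latest_checkpoint(check_point_dir)
        if latest is not None:
            tf.train.Saver().restore(sess, latest)
        else:
            sess.run(tf.global_variables_initializer())

# e.g. restore_if_available(sess, graph, 'models/nolimit_holdem_nfsp/no_all_in')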
Example #7
def nfsp():
    import tensorflow as tf
    if tf.test.gpu_device_name():
        print('GPU found')
    else:
        print("No GPU found")

    #os.environ['TF_CPP_MIN_LOG_LEVEL']='2'

    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'record_action': False,
                          'game_player_num': 2
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 12,
                               'game_player_num': 2
                           })
    eval_env2 = rlcard.make('no-limit-holdem',
                            config={
                                'seed': 43,
                                'game_player_num': 2
                            })

    # Set the iteration numbers and how frequently we evaluate the performance

    # The initial memory size
    memory_init_size = 1000

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_nfsp_result/1v1MCNFSPv3'

    # Set a global seed
    set_global_seed(0)

    graph = tf.Graph()
    sess = tf.Session(graph=graph)

    evaluate_every = 1000
    evaluate_num = 250
    episode_num = 5000

    # The initial memory size
    memory_init_size = 1500

    # Train the agent every X steps
    train_every = 256
    agents = []
    with graph.as_default():

        # Model1v1V3cp10good
        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(0),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.1,
                      rl_learning_rate=.1,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every,
                      q_train_every=train_every,
                      q_mlp_layers=[512, 512]))

        agents.append(
            NFSPAgent(sess,
                      scope='nfsp' + str(1),
                      action_num=env.action_num,
                      state_shape=env.state_shape,
                      hidden_layers_sizes=[512, 512],
                      anticipatory_param=0.075,
                      rl_learning_rate=0.075,
                      min_buffer_size_to_learn=memory_init_size,
                      q_replay_memory_init_size=memory_init_size,
                      train_every=train_every // 2,
                      q_train_every=train_every // 2,
                      q_mlp_layers=[512, 512]))

    # check_point_path = os.path.join('models\\nolimit_holdem_nfsp\\1v1MCNFSPv3\\cp\\10')
    print(
        '-------------------------------------------------------------------------------------'
    )
    # print(check_point_path)
    with sess.as_default():
        with graph.as_default():
            saver = tf.train.Saver()
            # saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

            global_step = tf.Variable(0, name='global_step', trainable=False)
            random_agent = RandomAgent(action_num=eval_env2.action_num)

            #easy_agent = nfsp_agents[0]
            print(agents)
            # print(nfsp_agents)
            env.set_agents(agents)
            eval_env.set_agents(agents)
            eval_env2.set_agents([agents[0], random_agent])

            # Initialize global variables
            sess.run(tf.global_variables_initializer())

            # Init a Logger to plot the learning curve
            logger = Logger(log_dir)

            for episode in range(episode_num):

                # First sample a policy for the episode
                for agent in agents:
                    agent.sample_episode_policy()
                table = []
                # Generate data from the environment
                trajectories, _ = env.run(is_training=True)

                # Feed transitions into agent memory, and train the agent
                for i in range(env.player_num):
                    for ts in trajectories[i]:
                        agents[i].feed(ts, table)

                # Evaluate the performance. Play with random agents.
                if episode % evaluate_every == 0:
                    logger.log(
                        '\n\n\n---------------------------------------------------------------\nTournament '
                        + str(episode / evaluate_every))
                    res = tournament(eval_env, evaluate_num)
                    res2 = tournament(eval_env2, evaluate_num // 4)
                    logger.log_performance(env.timestep, res[0])
                    logger.log_performance(env.timestep, res2[0])
                    logger.log('' + str(episode_num) + " - " + str(episode) +
                               '\n')
                    logger.log(
                        '\n\n----------------------------------------------------------------'
                    )

                if episode % evaluate_every == 0 and episode != 0:
                    save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good' + str(
                        episode // evaluate_every)
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver = tf.train.Saver()
                    saver.save(sess, os.path.join(save_dir, 'model'))

            logger.log(
                '\n\n\n---------------------------------------------------------------\nTournament '
                + str(episode / evaluate_every))
            res = tournament(eval_env, evaluate_num)
            logger.log_performance(env.timestep, res[0])
            logger.log('' + str(episode_num) + " - " + str(episode))

            # Close files in the logger
            logger.close_files()

            # Plot the learning curve
            logger.plot('NFSP')

            # Save model
            save_dir = 'models/nolimit_holdem_nfsp/1v1MCNFSPv3/cp/10/good'
            if not os.path.exists(save_dir):
                os.makedirs(save_dir)
            saver = tf.train.Saver()
            saver.save(sess, os.path.join(save_dir, 'model'))
Example #8
                    m_avg.append(rl_loss)

        t.set_description("rl loss: {}, payoff: {}, epsilon: {}".format(
            round(m_avg.get(), 2), 
            round(payoff_avg.get(), 2), 
            round(agent.epsilons[min(agent.total_t, agent.epsilon_decay_steps-1)], 2)
            ), refresh=True)

        # q = env.agents[0].eval_step(state)[1]
        # probs = {ACTION_LIST[i]:round(q[i],3) for i in range(len(q)) if q[i] != -100}
        # probs = sorted(probs.items(), key=lambda x: x[1], reverse=True)
        # tqdm.write(str(probs))

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == evaluate_every - 1:
            logger.log_performance(env.timestep, tournament_tractor(eval_env, evaluate_num)[0])
            logger.log("rl loss: {}, payoff: {}, epsilon: {}".format(
                round(m_avg.get(), 2), 
                round(payoff_avg.get(), 2), 
                round(agent.epsilons[min(agent.total_t, agent.epsilon_decay_steps-1)], 2)
            ))
            saver.save(sess, os.path.join(save_dir, 'model'))

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN')
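m_avg and payoff_avg are running-average trackers defined elsewhere in this script; only their append() and get() calls are visible in the fragment. A minimal stand-in with the same two-method interface, purely as an assumption about their behaviour, is sketched below.

from collections import deque

class MovingAvg(object):
    ''' Fixed-window running average with the append()/get() interface
        that m_avg and payoff_avg above appear to use. '''
    def __init__(self, window=100):
        self.values = deque(maxlen=window)

    def append(self, value):
        self.values.append(value)

    def get(self):
        return sum(self.values) / len(self.values) if self.values else 0.0

m_avg = MovingAvg(window=100)       # tracks the RL loss
payoff_avg = MovingAvg(window=100)  # tracks the episode payoff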