Code Example #1
    eval_env.set_agents([agent, random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            agent.feed(ts)

        # Evaluate the performance. Play with random agents.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])

    # Close files in the logger
    logger.close_files()

    # Plot the learning curve
    logger.plot('DQN')

    # Save model
    save_dir = 'models/doudizhu_dqn'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    saver = tf.train.Saver()
    saver.save(sess, os.path.join(save_dir, 'model'))
Code Example #2
    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger_mcts = Logger(log_dir_mcts)
    logger_dqn = Logger(log_dir_dqn)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts in trajectories[0]:
            dqn_agent.feed(ts)

        # Evaluate the performance of both players on the same tournament run
        if episode % evaluate_every == 0:
            payoffs = tournament(eval_env, evaluate_num)
            logger_mcts.log_performance(env.timestep, payoffs[0])
            logger_dqn.log_performance(env.timestep, payoffs[1])

    # Close files in the logger
    logger_mcts.close_files()
    logger_dqn.close_files()

    # Plot the learning curve
    logger_mcts.plot('MCTS')
    logger_dqn.plot('DQN')
Code Example #3
def main():
    # Make environment
    env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})
    eval_env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 10000
    iteration_num = 100000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/blackjack_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.compat.v1.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[10, 10])
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Initialize global variables
        sess.run(tf.compat.v1.global_variables_initializer())

        # Initialize a Logger to plot the learning curve
        logger = Logger(log_dir)

        for iteration in range(iteration_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance on the evaluation environment
            if iteration % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/blackjack_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.compat.v1.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
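A checkpoint saved this way can later be restored for evaluation, for example in a separate script that rebuilds the same graph. The lines below are a minimal sketch under that assumption, reusing the names defined in the example above (eval_env, memory_init_size, train_every, evaluate_num and the imports); they are not part of the original example.

with tf.compat.v1.Session() as sess:
    # Rebuild the graph exactly as during training, then load the weights
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=eval_env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=eval_env.state_shape,
                     mlp_layers=[10, 10])
    eval_env.set_agents([agent])
    saver = tf.compat.v1.train.Saver()
    saver.restore(sess, os.path.join('models/blackjack_dqn', 'model'))

    # Evaluate the restored agent
    print(tournament(eval_env, evaluate_num)[0])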
Code Example #4
File: run_rl.py  Project: billh0420/rlcard
def train(args):

    # Check whether a GPU is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data into (state, action, reward, next_state, done) transitions
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
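For context, train(args) above reads its settings from an argparse-style namespace. The driver below is a minimal sketch and not part of the snippet as shown here; the argument names are inferred from the attributes the function accesses (args.env, args.algorithm, args.seed, args.num_episodes, args.num_eval_games, args.evaluate_every, args.log_dir), and the defaults are illustrative only.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Assumed driver for train(args)')
    parser.add_argument('--env', type=str, default='leduc-holdem')
    parser.add_argument('--algorithm', type=str, default='dqn', choices=['dqn', 'nfsp'])
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_episodes', type=int, default=5000)
    parser.add_argument('--num_eval_games', type=int, default=2000)
    parser.add_argument('--evaluate_every', type=int, default=100)
    parser.add_argument('--log_dir', type=str, default='experiments/dqn_result/')
    args = parser.parse_args()

    train(args)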
Code Example #5
    eval_env.set_agents([agent1, agent2])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):

        # Generate data from the environment
        trajectories, _ = env.run(is_training=True)

        # Feed transitions into agent memory, and train the agent
        for ts1 in trajectories[0]:
            agent1.feed(ts1)

        for ts2 in trajectories[1]:
            agent2.feed(ts2)

        # Evaluate the performance. agent1 plays against agent2.
        if episode % evaluate_every == 0:
            logger.log_performance(env.timestep,
                                   tournament(eval_env, evaluate_num)[0])

    env.set_agents([agent1, random_agent])
    eval_env.set_agents([agent1, random_agent])
    # env.game_over_print = True
    # eval_env.game_over_print = True

    for episode in range(episode_num // 4):
Code Example #6
File: selfplay.py  Project: dpmaloney/PokerGuyZero
def main():
    # Make environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    agent = DQNAgent(num_actions=env.num_actions,
                     state_shape=env.state_shape[0],
                     mlp_layers=[64, 64, 64, 64],
                     device=device)

    agents = [agent, load_model("model.pth")]

    env.set_agents(agents)

    with Logger('./') as logger:
        for episode in range(iteration_num):

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data into (state, action, reward, next_state, done) transitions
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, the DQN agent always plays the first position and the
            # opponent is a previously saved copy of itself (refreshed every
            # selfplay_every episodes)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance against the current self-play opponent
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(env, evaluate_num)[0])
            if episode % selfplay_every == 0:
                save_path = os.path.join('./', str(episode) + "model.pth")
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(str(episode) + "model.pth")]
                env.set_agents(agents)

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    #plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nlh_cfr_result/'

    # Set a global seed
    set_seed(0)
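The snippet above calls load_model(...) to build the self-play opponent, but the helper itself is not shown. The sketch below is an assumption about what it does, given that the agent object is saved with torch.save(agent, path); it is not taken from the original project.

import torch

def load_model(model_path, device=None):
    # Assumed helper: the whole agent was pickled with torch.save, so
    # torch.load returns a ready-to-use agent mapped onto the chosen device
    agent = torch.load(model_path, map_location=device)
    return agent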
Code Example #7
File: nolimit_holdem_dqn.py  Project: LJQCN101/rlcard
def main():
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'seed': 0,
                          'env_num': 16,
                          'game_player_num': 4
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 0,
                               'env_num': 16
                           })

    # Set the number of episodes and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])

        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        #saver.restore(sess, os.path.join(save_dir, 'model'))

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            agent2.sample_episode_policy()
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            for ts in trajectories[2]:
                agent2.feed(ts)

            # Evaluate the performance. The DQN agent plays against the NFSP agent.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(sess, os.path.join(save_dir, 'model_final'))
Code Example #8
def main():
    # Make environment
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})

    # Set the number of episodes and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 10000
    episode_num = 800000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.5

    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])
        # random_agent = RandomAgent(action_num=eval_env.action_num)
        cfr_agent = models.load('leduc-holdem-cfr').agents[0]
        env.set_agents([agent, agent])
        eval_env.set_agents([agent, cfr_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        saver = tf.train.Saver()
        save_dir = 'models/leduc_holdem_dqn'
        saver.restore(sess, os.path.join(save_dir, 'model'))

        for episode in range(episode_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance against the pre-trained CFR agent
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    # Save model
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')