Example #1
    def __init__(self):
        ''' Load pretrained model
        '''
        import os
        import rlcard
        import tensorflow as tf
        from rlcard.agents import DQNAgent
        #tf.compat.v1.global_variables_initializer()
        #tf.compat.v1.local_variables_initializer()

        env = rlcard.make('uno')

        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        with self.graph.as_default():

            self.dqn_agents = []
            for i in range(env.player_num):
                agent = DQNAgent(self.sess,
                                 scope='dqn' + str(i),
                                 action_num=env.action_num,
                                 state_shape=env.state_shape,
                                 mlp_layers=[512, 512])
                self.dqn_agents.append(agent)

        check_point_path = os.path.join(ROOT_PATH, 'uno_dqn')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess,
                              tf.train.latest_checkpoint(check_point_path))
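
A small hedged addition (not part of the scraped example): loader classes like this usually also expose the restored agents to callers. The property below is only a sketch; the name `agents` is an assumption, not something the original snippet defines.

    @property
    def agents(self):
        ''' Return the loaded DQN agents, one per UNO seat (sketch only). '''
        return self.dqn_agents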
Example #2
    def __init__(self):
        ''' Load pretrained model
        '''
        import os
        import rlcard
        import tensorflow as tf
        from rlcard.agents import DQNAgent
        self.graph = tf.Graph()

        # Mitigation for gpu memory issue
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        self.sess = tf.Session(graph=self.graph, config=config)

        env = rlcard.make('tractor')
        with self.graph.as_default():
            self.dqn_agents = []
            for i in range(1):
                agent = DQNAgent(self.sess,
                                 scope='dqn',
                                 action_num=env.action_num,
                                 state_shape=env.state_shape,
                                 mlp_layers=[2048, 2048],
                                 replay_memory_size=100000,
                                 update_target_estimator_every=100,
                                 discount_factor=0.5,
                                 epsilon_start=1,
                                 epsilon_end=0.1,
                                 epsilon_decay_steps=100000,
                                 batch_size=256,
                                 learning_rate=0.00002,
                                 use_rule_policy=False)
                self.dqn_agents.append(agent)

        check_point_path = os.path.join(TRACTOR_PATH, 'tractor_dqn_345k')

        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess, tf.train.latest_checkpoint(check_point_path))
Example #3
    def __init__(self):
        ''' Load pretrained model
        '''
        import os
        import rlcard
        import tensorflow as tf
        from rlcard.agents import DQNAgent

        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)
        env = rlcard.make('limit-holdem')
        with self.graph.as_default():
            self.dqn_agents = []
            agent = DQNAgent(self.sess,
                              scope='dqn',
                              action_num=env.action_num,
                              replay_memory_init_size=1000,
                              train_every=1,
                              state_shape=env.state_shape,
                              mlp_layers=[512,512])
            self.dqn_agents.append(agent)

        check_point_path = os.path.join(ROOT_PATH, 'limit_holdem_dqn')
        with self.sess.as_default():
            with self.graph.as_default():
                saver = tf.train.Saver()
                saver.restore(self.sess,tf.train.latest_checkpoint(check_point_path))
Example #4
def main():
    # Make environment
    env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})
    eval_env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})

    # Set the number of iterations and how frequently we evaluate performance
    evaluate_every = 100
    evaluate_num = 10000
    iteration_num = 100000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    # The paths for saving the logs and learning curves
    log_dir = './experiments/blackjack_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.compat.v1.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[10, 10])
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Initialize global variables
        sess.run(tf.compat.v1.global_variables_initializer())

        # Initialize a Logger to plot the learning curve
        logger = Logger(log_dir)

        for iteration in range(iteration_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance of the agent.
            if iteration % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(eval_env, evaluate_num)[0])

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')

        # Save model
        save_dir = 'models/blackjack_dqn'
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver = tf.compat.v1.train.Saver()
        saver.save(sess, os.path.join(save_dir, 'model'))
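
A hedged companion sketch (not part of the original example) of how the checkpoint saved above might be restored later for evaluation: rebuild the agent with the same scope and layers, restore the weights, and score it with rlcard's tournament utility. The paths and evaluation size are assumptions.

import os

import tensorflow as tf

import rlcard
from rlcard.agents import DQNAgent
from rlcard.utils import tournament

eval_env = rlcard.make('blackjack', config={'env_num': 4, 'seed': 0})

with tf.compat.v1.Session() as sess:
    # Rebuild the same network before restoring the saved variables
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=eval_env.action_num,
                     state_shape=eval_env.state_shape,
                     mlp_layers=[10, 10])
    saver = tf.compat.v1.train.Saver()
    saver.restore(sess, os.path.join('models/blackjack_dqn', 'model'))

    eval_env.set_agents([agent])
    print('Average payoff:', tournament(eval_env, 1000)[0])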
Example #5
# The paths for saving the logs and learning curves
log_dir = './experiments/uno_single_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.compat.v1.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[128, 128])
    # Initialize global variables
    sess.run(tf.compat.v1.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    state = env.reset()

    for timestep in range(timesteps):
        action = agent.step(state)
        next_state, reward, done = env.step(action)
        ts = (state, action, reward, next_state, done)
Example #6
import os

import tensorflow as tf

import rlcard
from rlcard.agents import DQNAgent
# Assumed import path for the human agent used below
from rlcard.agents import LimitholdemHumanAgent as HumanAgent

# Make environment and enable human mode
# Set 'record_action' to True because we need it to print results
env = rlcard.make('limit-holdem', config={'record_action': True})
human_agent = HumanAgent(env.action_num)

with tf.Session() as sess:
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=1000,
                     train_every=1,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])

    saver = tf.train.Saver()
    save_dir = 'models/limit_holdem_dqn'
    saver.restore(sess, os.path.join(save_dir, 'model'))

    env.set_agents([human_agent, agent])

    print(">> Limit Hold'em pre-trained model")

    while True:
        print(">> Start a new game")
Example #7

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/gin_rummy_dqn_result/'

# Set a global seed
set_global_seed(0)

agent = DQNAgent(
    scope='dqn',
    action_num=env.action_num,
    #replay_memory_size=20000,
    replay_memory_size=1000,
    #replay_memory_init_size=memory_init_size,
    replay_memory_init_size=500,
    train_every=train_every,
    #state_shape=env.state_shape,
    state_shape=[768],
    mlp_layers=[512, 512],
    device=torch.device('cpu'))

random_agent = RandomAgent(action_num=eval_env.action_num)
env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):
    print('epi: ', episode)
Example #8
# latest_ckpt = tf.train.latest_checkpoint(check_point_path)

sess = tf.Session(graph=graph, config=config)

env = rlcard.make('tractor')
with graph.as_default():
    dqn_agents = []
    for i in range(1):
        agent = DQNAgent(sess,
                scope='dqn',
                action_num=env.action_num,
                state_shape=env.state_shape,
                mlp_layers=[2048,2048],
                replay_memory_size=100000,
                update_target_estimator_every=100,
                discount_factor=0.5,
                epsilon_start=1,
                epsilon_end=0.1,
                epsilon_decay_steps=100000,
                batch_size=256,
                learning_rate=0.00002,
                use_rule_policy=False
        )
        dqn_agents.append(agent)

# check_point_path = os.path.join(TRACTOR_PATH, 'dqn_10k_blindcard')

with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))
Example #9

log_dir = './experiments/nolimit_holdem_dqn_result/'

# Set a global seed
# set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    with tf.variable_scope('agent1'):
        # Set up the agents
        agent1 = DQNAgent(sess,
                          scope='dqn',
                          action_num=env.action_num,
                          replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          state_shape=env.state_shape,
                          mlp_layers=[1280, 1280])

    with tf.variable_scope('agent2'):
        agent2 = DQNAgent(sess,
                          scope='dqn',
                          action_num=eval_env.action_num,
                          replay_memory_init_size=memory_init_size,
                          train_every=train_every,
                          state_shape=eval_env.state_shape,
                          mlp_layers=[512, 512])

    random_agent = RandomAgent(action_num=eval_env.action_num)
    human_agent = NolimitholdemHumanAgent(eval_env.action_num)
Example #10
log_dir = './experiments/uno_dqn_result/'

# Set a global seed
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_size=20000,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[60, 60, 60, 60, 60],
                     batch_size=512)

    saver = tf.train.Saver()
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent])
    eval_env.set_agents([agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # saver.restore(sess, "models/uno_dqn5/model")
Example #11
def main():
    # Make environment
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('no-limit-holdem', config={'seed': 0, 'env_num': 4})

    # Set the number of iterations and how frequently we evaluate performance
    evaluate_every = 5000
    selfplay_every = 25000
    evaluate_num = 10000
    iteration_num = 8000000

    # The initial memory size
    memory_init_size = 100

    # Train the agent every X steps
    train_every = 1

    agent = DQNAgent(num_actions=env.num_actions,
                     state_shape=env.state_shape[0],
                     mlp_layers=[64, 64, 64, 64],
                     device=device)

    agents = [agent, load_model("model.pth")]

    env.set_agents(agents)

    with Logger('./') as logger:
        for episode in range(iteration_num):

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play against the previously saved model.
            if episode % evaluate_every == 0:
                logger.log_performance(env.timestep,
                                       tournament(env, evaluate_num)[0])
            if episode % selfplay_every == 0:
                save_path = os.path.join('./', str(episode) + "model.pth")
                torch.save(agent, save_path)
                print('Model saved in', save_path)
                agents = [agent, load_model(str(episode) + "model.pth")]
                env.set_agents(agents)

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    #plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join('./', 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nlh_cfr_result/'

    # Set a global seed
    set_seed(0)
Example #12
def main():
    # Make environment
    env = rlcard.make('no-limit-holdem',
                      config={
                          'seed': 0,
                          'env_num': 16,
                          'game_player_num': 4
                      })
    eval_env = rlcard.make('no-limit-holdem',
                           config={
                               'seed': 0,
                               'env_num': 16
                           })

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 1000
    episode_num = 200000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.8

    # The paths for saving the logs and learning curves
    log_dir = './experiments/nolimit_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])

        agent2 = NFSPAgent(sess,
                           scope='nfsp',
                           action_num=env.action_num,
                           state_shape=env.state_shape,
                           hidden_layers_sizes=[512, 512],
                           anticipatory_param=0.1,
                           min_buffer_size_to_learn=memory_init_size,
                           q_replay_memory_init_size=memory_init_size,
                           train_every=64,
                           q_train_every=64,
                           q_mlp_layers=[512, 512])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        save_dir = 'models/nolimit_holdem_dqn'
        saver = tf.train.Saver()
        #saver.restore(sess, os.path.join(save_dir, 'model'))

        random_agent = RandomAgent(action_num=eval_env.action_num)
        env.set_agents([agent, agent, agent2, random_agent])
        eval_env.set_agents([agent, agent2])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        for episode in range(episode_num):
            agent2.sample_episode_policy()
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            for ts in trajectories[2]:
                agent2.feed(ts)

            # Evaluate the performance. Play against the NFSP agent.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        saver.save(sess, os.path.join(save_dir, 'model_final'))
Example #13
def main():
    # Make environment
    env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})
    eval_env = rlcard.make('leduc-holdem', config={'seed': 0, 'env_num': 4})

    # Set the number of iterations and how frequently we evaluate the performance
    evaluate_every = 100
    evaluate_num = 10000
    episode_num = 800000

    # The initial memory size
    memory_init_size = 1000

    # Train the agent every X steps
    train_every = 1

    _reward_max = -0.5

    # The paths for saving the logs and learning curves
    log_dir = './experiments/leduc_holdem_dqn_result/'

    # Set a global seed
    set_global_seed(0)

    with tf.Session() as sess:

        # Initialize a global step
        global_step = tf.Variable(0, name='global_step', trainable=False)

        # Set up the agents
        agent = DQNAgent(sess,
                         scope='dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 128])
        # random_agent = RandomAgent(action_num=eval_env.action_num)
        cfr_agent = models.load('leduc-holdem-cfr').agents[0]
        env.set_agents([agent, agent])
        eval_env.set_agents([agent, cfr_agent])

        # Initialize global variables
        sess.run(tf.global_variables_initializer())

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir)

        saver = tf.train.Saver()
        save_dir = 'models/leduc_holdem_dqn'
        saver.restore(sess, os.path.join(save_dir, 'model'))

        for episode in range(episode_num):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play against the pre-trained CFR agent.
            if episode % evaluate_every == 0:
                _reward = tournament(eval_env, evaluate_num)[0]
                logger.log_performance(episode, _reward)
                if _reward > _reward_max:
                    # Save model
                    if not os.path.exists(save_dir):
                        os.makedirs(save_dir)
                    saver.save(sess, os.path.join(save_dir, 'model'))
                    _reward_max = _reward

        # Close files in the logger
        logger.close_files()

        # Plot the learning curve
        logger.plot('DQN')
Example #14
import os

import tensorflow as tf

import rlcard
from rlcard.agents import DQNAgent
from rlcard.utils import set_global_seed

# Make environment
env = rlcard.make('uno', config={'seed': 0})

# Set a global seed
set_global_seed(0)

# Load pretrained model
graph = tf.Graph()
sess = tf.Session(graph=graph)

with graph.as_default():
    dqn_agents = []
    for i in range(env.player_num):
        agent = DQNAgent(sess,
                         scope='dqn' + str(i),
                         action_num=env.action_num,
                         state_shape=env.state_shape,
                         mlp_layers=[512, 512])
        dqn_agents.append(agent)

# We have a pretrained model here. Change the path for your model.
check_point_path = os.path.join(rlcard.__path__[0],
                                'models/pretrained/uno_dqn')

with sess.as_default():
    with graph.as_default():
        saver = tf.train.Saver()
        saver.restore(sess, tf.train.latest_checkpoint(check_point_path))

# Evaluate the performance. Play with random agents.
evaluate_num = 1000
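
The scraped snippet cuts off before the evaluation itself. Below is a hedged sketch of what that step usually looks like, reusing the dqn_agents list and evaluate_num defined above together with rlcard's RandomAgent and tournament helpers (as in the other examples); the exact seating is an assumption.

# Sketch only: first seat uses the pretrained agent, the rest play randomly.
from rlcard.agents import RandomAgent
from rlcard.utils import tournament

eval_env = rlcard.make('uno', config={'seed': 0})
eval_env.set_agents([dqn_agents[0]] +
                    [RandomAgent(action_num=eval_env.action_num)
                     for _ in range(eval_env.player_num - 1)])
print('Average payoff:', tournament(eval_env, evaluate_num)[0])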
Example #15
set_global_seed(0)

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(
        sess,
        scope='dqn',
        replay_memory_size=replay_memory_size,
        replay_memory_init_size=memory_init_size,
        update_target_estimator_every=update_target_estimator_every,
        discount_factor=discount_factor,
        epsilon_start=epsilon_start,
        epsilon_end=epsilon_end,
        epsilon_decay_steps=epsilon_decay_steps,
        batch_size=batch_size,
        action_num=env.action_num,
        state_shape=env.state_shape,
        train_every=train_every,
        mlp_layers=mlp_layers,
        learning_rate=learning_rate)

    random_agent = RandomAgent(action_num=eval_env.action_num)
    agent_list = [agent, random_agent, random_agent]  # default

    # Deactivated for now: this may be unnecessary if the landlord score is
    # already used for switching positions/roles.
    if landlord_score:
        agent_list = [agent, random_agent, random_agent]
Example #16
memory_init_size = 1000

# Train the agent every X steps
train_every = 1

# The paths for saving the logs and learning curves
log_dir = './experiments/dqn_random_result/'

# Set a global seed
set_global_seed(0)

# Set up the agents
agent = DQNAgent(scope='dqn',
                 action_num=env.action_num,
                 replay_memory_init_size=memory_init_size,
                 train_every=train_every,
                 state_shape=env.state_shape,
                 mlp_layers=[128, 128],
                 device=torch.device('cpu'))
random_agent = RandomAgent(action_num=eval_env.action_num)

env.set_agents([agent, random_agent])
eval_env.set_agents([agent, random_agent])

# Init a Logger to plot the learning curve
logger = Logger(log_dir)

for episode in range(episode_num):

    # Generate data from the environment
    trajectories, _ = env.run(is_training=True)
Example #17
def train(args):

    # Check whether gpu is available
    device = get_device()

    # Seed numpy, torch, random
    set_seed(args.seed)

    # Make the environment with seed
    env = rlcard.make(args.env, config={
        'seed': args.seed,
    })

    # Initialize the agent and use random agents as opponents
    if args.algorithm == 'dqn':
        from rlcard.agents import DQNAgent
        agent = DQNAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            mlp_layers=[64, 64],
            device=device,
        )
    elif args.algorithm == 'nfsp':
        from rlcard.agents import NFSPAgent
        agent = NFSPAgent(
            num_actions=env.num_actions,
            state_shape=env.state_shape[0],
            hidden_layers_sizes=[64, 64],
            q_mlp_layers=[64, 64],
            device=device,
        )
    agents = [agent]
    for _ in range(1, env.num_players):
        agents.append(RandomAgent(num_actions=env.num_actions))
    env.set_agents(agents)

    # Start training
    with Logger(args.log_dir) as logger:
        for episode in range(args.num_episodes):

            if args.algorithm == 'nfsp':
                agents[0].sample_episode_policy()

            # Generate data from the environment
            trajectories, payoffs = env.run(is_training=True)

            # Reorganize the data to be state, action, reward, next_state, done
            trajectories = reorganize(trajectories, payoffs)

            # Feed transitions into agent memory, and train the agent
            # Here, we assume that DQN always plays the first position
            # and the other players play randomly (if any)
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % args.evaluate_every == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(
                        env,
                        args.num_eval_games,
                    )[0])

        # Get the paths
        csv_path, fig_path = logger.csv_path, logger.fig_path

    # Plot the learning curve
    plot_curve(csv_path, fig_path, args.algorithm)

    # Save model
    save_path = os.path.join(args.log_dir, 'model.pth')
    torch.save(agent, save_path)
    print('Model saved in', save_path)
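
A hedged follow-up sketch (not part of the original) showing one way the saved model.pth could be reloaded later and evaluated against random agents with the same new-style rlcard utilities this example uses. The environment name and log directory below are placeholders for whatever args.env and args.log_dir were during training.

import os

import torch
import rlcard
from rlcard.agents import RandomAgent
from rlcard.utils import tournament

log_dir = './experiments/leduc_holdem_dqn_result/'  # placeholder for args.log_dir
env = rlcard.make('leduc-holdem')                   # placeholder for args.env

# torch.save(agent, ...) stored the whole agent object, so torch.load returns it
agent = torch.load(os.path.join(log_dir, 'model.pth'))
env.set_agents([agent] + [RandomAgent(num_actions=env.num_actions)
                          for _ in range(1, env.num_players)])
print('Average payoff:', tournament(env, 1000)[0])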
Example #18
# Mitigation for gpu memory issue
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.9
# with tf.Session(config=config) as sess:

with tf.Session() as sess:

    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    agent = DQNAgent(sess,
                     scope='dqn',
                     action_num=env.action_num,
                     replay_memory_init_size=memory_init_size,
                     train_every=train_every,
                     state_shape=env.state_shape,
                     mlp_layers=[512, 512])
    random_agent = RandomAgent(action_num=eval_env.action_num)
    env.set_agents([agent, random_agent, random_agent])
    eval_env.set_agents([agent, random_agent, random_agent])

    # Initialize global variables
    sess.run(tf.global_variables_initializer())

    # Init a Logger to plot the learning curve
    logger = Logger(log_dir)

    for episode in range(episode_num):
Example #19
with tf.Session(config=config) as sess:
    # Initialize a global step
    global_step = tf.Variable(0, name='global_step', trainable=False)

    # Set up the agents
    for i in range(1):
        agent = DQNAgent(sess,
                         scope='dqn' if i == 0 else 'dqn' + str(i),
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[2048, 2048],
                         replay_memory_size=100000,
                         update_target_estimator_every=500,
                         discount_factor=0.99,
                         epsilon_start=0.1,
                         epsilon_end=0.1,
                         epsilon_decay_steps=100000,
                         batch_size=256,
                         learning_rate=0.00002,
                         use_rule_policy=False)
        agents.append(agent)
    
    random_agent = RandomAgent(action_num=eval_env.action_num)
    rule_agent = TractorRuleAgent(action_num=eval_env.action_num)

    # 1 dqn agent vs 3 rule agent
    # env.set_agents([agent, rule_agent, rule_agent, rule_agent])