Example no. 1
def dqn_run_experiments():
    """Run NUM_EXP independent DQN training runs on Blackjack and plot the averaged learning curve."""
    for i in range(NUM_EXP):
        # Make environment
        env = rlcard.make('blackjack', config={'seed': i})
        eval_env = rlcard.make('blackjack', config={'seed': i})

        # The number of training episodes and the evaluation frequency come from
        # the module-level constants DQN_TRAINING_DURATION and EVALUATE_EVERY

        # The initial memory size
        memory_init_size = 100

        # Train the agent every X steps
        train_every = 1

        # The paths for saving the logs and learning curves
        log_dir = f"{DQN_RES_DIR}/{i}"

        # Set up the agents
        agent = DQNAgent('dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 256, 512],
                         debug=True)
        env.set_agents([agent])
        eval_env.set_agents([agent])


        # Init a Logger to plot the learning curve
        logger = Logger(log_dir, debug=True)

        for episode in range(DQN_TRAINING_DURATION):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance of the current policy on the evaluation environment
            if episode % EVALUATE_EVERY == 0:
                logger.log_performance(env.timestep, tournament(eval_env, EVALUATE_NUM_OF_HANDS)[0])

        # Close the logger's files once training is finished
        # logger.close_files()

        # Plot the learning curve
        logger.plot(f"DQN_{i}")
    BaseAgent.plot_avg(DQN_RES_DIR, "DQN")
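
The snippet above relies on module-level constants defined elsewhere in the project. A minimal sketch of that configuration, with illustrative values that are assumptions rather than the project's actual settings:

# Assumed module-level configuration; the values below are illustrative
# placeholders, not the project's real settings.
NUM_EXP = 10                     # number of independent experiment runs
DQN_TRAINING_DURATION = 100_000  # training episodes per run
EVALUATE_EVERY = 1_000           # evaluate the current policy every N episodes
EVALUATE_NUM_OF_HANDS = 10_000   # hands played per evaluation tournament
DQN_RES_DIR = "results/dqn"      # root directory for logs and learning curves
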
Example no. 2
def train(self):
    for i in range(0, TRAINING_DURATION // EVALUATE_EVERY + 1):
        # Log the performance of the current policy before the next training chunk
        self.logger.log_performance(
            i * EVALUATE_EVERY,
            tournament(self._env, EVALUATE_NUM_OF_HANDS)[0])
        # Train for EVALUATE_EVERY more episodes of Monte Carlo control.
        # Best alpha found so far is 0.015
        self.eval_policy, self.q = mc_control(
            self._env,
            q=self.q,
            to_train=EVALUATE_EVERY,
            already_trained=EVALUATE_EVERY * i,
            alpha=self.alpha,
            gamma=1.0,
            eps_start=1.0,
            eps_decay=0.99999,
            eps_min=0.015)
    self.policy = self.eval_policy
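
mc_control is defined elsewhere in the project. As a rough sketch of the technique its arguments imply, the code below shows a constant-alpha, every-visit Monte Carlo update with an epsilon-greedy behaviour policy; the episode format and helper names are assumptions, not the project's code.

import numpy as np
from collections import defaultdict

def make_q(n_actions):
    # Tabular action-value table: a row of zeros for each newly seen state.
    return defaultdict(lambda: np.zeros(n_actions))

def epsilon_greedy_action(q, state, n_actions, eps):
    # With probability eps explore, otherwise exploit the current estimates.
    if np.random.rand() < eps:
        return np.random.randint(n_actions)
    return int(np.argmax(q[state]))

def mc_update(q, episode, alpha, gamma):
    # episode: list of (state, action, reward) tuples from one finished hand.
    g = 0.0
    for state, action, reward in reversed(episode):
        g = gamma * g + reward                              # return from this step onward
        q[state][action] += alpha * (g - q[state][action])  # constant-alpha update
    return q

The eps_start / eps_decay / eps_min arguments in the call above suggest that epsilon is decayed multiplicatively after every episode and clipped at eps_min, while alpha stays constant throughout training.
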
Example no. 3
def play(self, num_plays=NUM_HANDS):
    # Play num_plays evaluation hands and return the average payoff(s).
    return tournament(self._env, num_plays)
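
tournament is RLCard's evaluation helper. As a rough sketch of what such a call computes, assuming env.run(is_training=False) returns (trajectories, payoffs) as in Example no. 1:

import numpy as np

def simple_tournament(env, num_hands):
    # Play num_hands evaluation hands and return the average payoff per player.
    totals = None
    for _ in range(num_hands):
        _, payoffs = env.run(is_training=False)
        payoffs = np.asarray(payoffs, dtype=float)
        totals = payoffs if totals is None else totals + payoffs
    return totals / num_hands
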
Example no. 4
def train(self):
    # No learning update happens in this loop: it only logs the performance of
    # the existing policy at the same evaluation points as the learning agents.
    for i in range(0, TRAINING_DURATION // EVALUATE_EVERY + 1):
        self.logger.log_performance(
            i * EVALUATE_EVERY,
            tournament(self._env, EVALUATE_NUM_OF_HANDS)[0])
    self.policy = self.eval_policy