def dqn_run_experiments():
    for i in range(NUM_EXP):
        # Make the training and evaluation environments
        env = rlcard.make('blackjack', config={'seed': i})
        eval_env = rlcard.make('blackjack', config={'seed': i})

        # Set the iteration numbers and how frequently we evaluate/save plots
        # The initial memory size
        memory_init_size = 100
        # Train the agent every X steps
        train_every = 1

        # The path for saving the logs and learning curves
        log_dir = f"{DQN_RES_DIR}/{i}"

        # Set up the agent
        agent = DQNAgent('dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 256, 512],
                         debug=True)
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir, debug=True)

        for episode in range(DQN_TRAINING_DURATION):
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance by playing evaluation hands
            if episode % EVALUATE_EVERY == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(eval_env, EVALUATE_NUM_OF_HANDS)[0])

        # Close files in the logger
        # logger.close_files()

        # Plot the learning curve for this run
        logger.plot(f"DQN_{i}")

    # Plot the learning curve averaged over all runs
    BaseAgent.plot_avg(DQN_RES_DIR, "DQN")
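# The runner above references project-level constants and helper classes
# (DQNAgent, Logger, BaseAgent, tournament) that are defined elsewhere in the
# repository. A minimal sketch of the assumed module-level setup follows; the
# import and every value here are hypothetical placeholders chosen only to make
# the snippet self-contained, not the project's actual configuration.
import rlcard

NUM_EXP = 10                       # number of independent DQN runs to average
DQN_TRAINING_DURATION = 100_000    # training episodes per run
EVALUATE_EVERY = 1_000             # evaluate the agent every N episodes
EVALUATE_NUM_OF_HANDS = 10_000     # hands played per evaluation tournament
DQN_RES_DIR = "results/dqn"        # root directory for logs and learning curves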
def train(self):
    for i in range(0, TRAINING_DURATION // EVALUATE_EVERY + 1):
        self.logger.log_performance(
            i * EVALUATE_EVERY,
            tournament(self._env, EVALUATE_NUM_OF_HANDS)[0])

        # Best alpha so far is 0.015
        self.eval_policy, self.q = mc_control(
            self._env,
            q=self.q,
            to_train=EVALUATE_EVERY,
            already_trained=EVALUATE_EVERY * i,
            alpha=self.alpha,
            gamma=1.0,
            eps_start=1.0,
            eps_decay=0.99999,
            eps_min=0.015)

    self.policy = self.eval_policy
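# For context on the hyperparameters passed above: mc_control is the project's
# own routine, but the (alpha, gamma, eps_*) arguments are those of a
# constant-alpha, epsilon-greedy Monte Carlo control scheme. The fragment below
# is only an illustrative sketch of that style of update under that assumption;
# the function name and data shapes are hypothetical, not the project's code.
def mc_update_sketch(q, episode, alpha=0.015, gamma=1.0):
    """Constant-alpha every-visit MC update over one finished episode.

    `episode` is a list of (state, action, reward) tuples and `q` maps each
    state to an indexable container of action values.
    """
    g = 0.0
    # Walk the episode backwards, accumulating the discounted return, and
    # nudge each visited Q(s, a) toward that return by a step of size alpha.
    for state, action, reward in reversed(episode):
        g = reward + gamma * g
        q[state][action] += alpha * (g - q[state][action])
    return q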
def play(self, num_plays=NUM_HANDS):
    return tournament(self._env, num_plays)
def train(self):
    for i in range(0, TRAINING_DURATION // EVALUATE_EVERY + 1):
        self.logger.log_performance(
            i * EVALUATE_EVERY,
            tournament(self._env, EVALUATE_NUM_OF_HANDS)[0])

    self.policy = self.eval_policy
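# A typical driver for the agent classes excerpted above, assuming they expose
# the train()/play() methods shown here. The class name `McAgent`, its
# constructor arguments, and the NUM_HANDS value are hypothetical placeholders.
if __name__ == "__main__":
    agent = McAgent(alpha=0.015)               # hypothetical constructor
    agent.train()                              # logs tournament performance while training
    payoffs = agent.play(num_plays=NUM_HANDS)  # average payoffs per player from tournament()
    print(f"Average payoff over {NUM_HANDS} hands: {payoffs[0]}")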