from collections import defaultdict

import gym
import numpy as np
from gym.envs.toy_text.blackjack import BlackjackEnv
from rlcard.utils import tournament

# Logger here takes a `debug` flag, so it is assumed to be a thin project
# wrapper around rlcard's Logger; plot_policy, plot_value_function, plot_avg
# and the NUM_* constants are project-level helpers defined elsewhere.


class BaseAgent:
    def __init__(self, env=None, log_dir=None):
        # Create the default environment lazily: a `gym.make(...)` call in the
        # signature would run once at definition time and be shared by every
        # instance.
        self._env = env if env is not None else gym.make('Blackjack-v0')
        self.q = defaultdict(lambda: np.zeros(self._env.action_space.n))
        self.policy = None
        self.eval_policy = None
        self.log_dir = log_dir
        self.logger = Logger(self.log_dir, debug=False)

        # Give a Gym Blackjack env the same `run` interface as an rlcard env,
        # so `tournament` can drive either one.
        if isinstance(self._env, BlackjackEnv):
            def run(is_training=False):  # is_training kept for interface parity
                observation = self._env.reset()
                while True:
                    if (self.eval_policy is None) or \
                            (observation not in self.eval_policy):
                        # No learned policy for this state: act uniformly at random
                        action = np.random.choice(
                            np.arange(self._env.action_space.n))
                    else:
                        action = np.argmax(self.eval_policy[observation])
                    observation, reward, done, info = self._env.step(action)
                    if done:
                        # Mimic rlcard's (trajectories, payoffs) return value;
                        # `tournament` only consumes the payoffs.
                        return info, np.asarray([int(reward)])

            self._env.run = run
            self._env.player_num = 1  # `tournament` expects this attribute

    def train(self):
        pass

    def play(self, num_plays=NUM_HANDS):
        return tournament(self._env, num_plays)

    def plot_policy(self, save=False, save_path=None):
        assert self.policy is not None
        plot_policy(self.policy, save=save, save_path=save_path)

    def plot_value_function(self):
        assert self.policy is not None
        plot_value_function(self.q)

    def plot(self, algo_name):
        self.logger.plot(algo_name)

    @staticmethod
    def plot_avg(base_dir, algo_name):
        csv_path_list = [f"{base_dir}/{j}/performance.csv"
                         for j in range(NUM_EXP)]
        label_names = [f"{algo_name}_{j}" for j in range(NUM_EXP)]
        plot_avg(csv_path_list, label_names, f"{algo_name}_Average",
                 f"{base_dir}/avg_fig.png")
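To make the extension point concrete, here is a minimal sketch of a subclass, assuming the imports above. `MCAgent` is hypothetical (it does not appear in the original code); it fills in `train()` with every-visit Monte Carlo control over the Gym Blackjack env, then publishes its Q-table as `eval_policy` so the injected `run` plays greedily during evaluation.

class MCAgent(BaseAgent):
    """Hypothetical example: every-visit Monte Carlo control."""

    def train(self, num_episodes=50_000, epsilon=0.1, gamma=1.0):
        n = self._env.action_space.n
        visits = defaultdict(lambda: np.zeros(n))
        for _ in range(num_episodes):
            # Roll out one episode with an epsilon-greedy policy over self.q
            episode, observation, done = [], self._env.reset(), False
            while not done:
                if np.random.rand() < epsilon:
                    action = np.random.randint(n)
                else:
                    action = int(np.argmax(self.q[observation]))
                next_observation, reward, done, _ = self._env.step(action)
                episode.append((observation, action, reward))
                observation = next_observation
            # Walk the episode backwards, updating Q with an incremental mean
            g = 0.0
            for observation, action, reward in reversed(episode):
                g = gamma * g + reward
                visits[observation][action] += 1
                self.q[observation][action] += (
                    (g - self.q[observation][action])
                    / visits[observation][action])
        # Publish the learned values so the injected `run` acts greedily
        self.eval_policy = dict(self.q)
        self.policy = {s: int(np.argmax(q)) for s, q in self.q.items()}

A typical session would then be (the log path is illustrative):

agent = MCAgent(log_dir="results/mc/0")
agent.train()
print(agent.play())  # mean payoff over NUM_HANDS evaluation hands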
import rlcard

# DQNAgent is assumed to be the project's variant of rlcard's DQN agent (the
# stock agent does not take a `debug` flag); DQN_RES_DIR, DQN_TRAINING_DURATION
# and the EVALUATE_* constants are project-level settings.


def dqn_run_experiments():
    for i in range(NUM_EXP):
        # Make the training and evaluation environments with the same seed
        env = rlcard.make('blackjack', config={'seed': i})
        eval_env = rlcard.make('blackjack', config={'seed': i})

        # Number of transitions to collect before training starts
        memory_init_size = 100

        # Train the agent every X steps
        train_every = 1

        # Where to save the logs and learning curves for this run
        log_dir = f"{DQN_RES_DIR}/{i}"

        # Set up the agent
        agent = DQNAgent('dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 256, 512],
                         debug=True)
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Init a Logger to record and plot the learning curve
        logger = Logger(log_dir, debug=True)

        for episode in range(DQN_TRAINING_DURATION):
            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into the agent's memory and train it
            for ts in trajectories[0]:
                agent.feed(ts)

            # Periodically evaluate the current policy on the held-out env
            if episode % EVALUATE_EVERY == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(eval_env, EVALUATE_NUM_OF_HANDS)[0])

        # Close files in the logger
        # logger.close_files()

        # Plot the learning curve for this run
        logger.plot(f"DQN_{i}")

    # Average the per-run curves into a single figure
    BaseAgent.plot_avg(DQN_RES_DIR, "DQN")
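The function leans on several module-level constants defined elsewhere in the project. The values below are illustrative placeholders, not the original settings, shown only so the blocks above run end to end; in the real module they would sit above the class and function definitions (NUM_HANDS is evaluated when `play`'s signature is defined).

NUM_EXP = 5                       # independent runs (seeds 0..NUM_EXP-1)
NUM_HANDS = 1000                  # default hands per evaluation in BaseAgent.play
DQN_RES_DIR = "results/dqn"       # root directory for per-run logs and figures
DQN_TRAINING_DURATION = 20_000    # training episodes per run
EVALUATE_EVERY = 100              # evaluate every N episodes
EVALUATE_NUM_OF_HANDS = 1000      # hands played per evaluation

if __name__ == '__main__':
    dqn_run_experiments()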