Code Example #1
from collections import defaultdict

import gym
import numpy as np
from gym.envs.toy_text.blackjack import BlackjackEnv

# Logger, tournament, plot_policy, plot_value_function, plot_avg and the
# NUM_* constants are assumed to come from the project's own helper modules
# (their imports are omitted here).


class BaseAgent:

    def __init__(self, env=None, log_dir=None):
        # Avoid building the env in the default argument: it would be created
        # once at import time and shared by every instance.
        self._env = env if env is not None else gym.make('Blackjack-v0')
        self.q = defaultdict(lambda: np.zeros(self._env.action_space.n))
        self.policy = None
        self.eval_policy = None
        self.log_dir = log_dir
        self.logger = Logger(self.log_dir, debug=False)
        # Attach an rlcard-style run() method and player count to the Gym
        # Blackjack env so that tournament() can evaluate it the same way.
        if isinstance(self._env, BlackjackEnv):
            def run(is_training=False):
                observation = self._env.reset()
                while True:
                    if (self.eval_policy is None) or \
                            (observation not in self.eval_policy):
                        # Unseen state (or no policy yet): act uniformly at random
                        action = np.random.choice(
                            np.arange(self._env.action_space.n))
                    else:
                        action = np.argmax(self.eval_policy[observation])
                    observation, reward, done, info = self._env.step(action)
                    if done:
                        # Mirror rlcard's env.run() return value:
                        # (trajectories, payoffs) for a single player
                        return info, np.asarray([int(reward)])

            self._env.run = run
            self._env.player_num = 1

    def train(self):
        # Implemented by each concrete agent subclass
        pass

    def play(self, num_plays=NUM_HANDS):
        return tournament(self._env, num_plays)

    def plot_policy(self, save=False, save_path=None):
        assert self.policy is not None
        plot_policy(self.policy, save=save, save_path=save_path)

    def plot_value_function(self):
        assert self.policy is not None
        plot_value_function(self.q)

    def plot(self, algo_name):
        self.logger.plot(algo_name)

    @staticmethod
    def plot_avg(base_dir, algo_name):
        csv_path_list = [f"{base_dir}/{j}/performance.csv" for j in
                         range(NUM_EXP)]
        label_names = [f"{algo_name}_{j}" for j in range(NUM_EXP)]
        plot_avg(csv_path_list, label_names, f"{algo_name}_Average",
                 f"{base_dir}/avg_fig.png")
Code Example #2
def dqn_run_experiments():
    for i in range(NUM_EXP):
        # Make environment
        env = rlcard.make('blackjack', config={'seed': i})
        eval_env = rlcard.make('blackjack', config={'seed': i})

        # The number of training episodes and the evaluation frequency come
        # from the module-level constants DQN_TRAINING_DURATION / EVALUATE_EVERY

        # The initial memory size
        memory_init_size = 100

        # Train the agent every X steps
        train_every = 1

        # The paths for saving the logs and learning curves
        log_dir = f"{DQN_RES_DIR}/{i}"

        # Set up the agents
        agent = DQNAgent('dqn',
                         action_num=env.action_num,
                         replay_memory_init_size=memory_init_size,
                         train_every=train_every,
                         state_shape=env.state_shape,
                         mlp_layers=[128, 256, 512],
                         debug=True)
        env.set_agents([agent])
        eval_env.set_agents([agent])

        # Init a Logger to plot the learning curve
        logger = Logger(log_dir, debug=True)

        for episode in range(DQN_TRAINING_DURATION):

            # Generate data from the environment
            trajectories, _ = env.run(is_training=True)

            # Feed transitions into agent memory, and train the agent
            for ts in trajectories[0]:
                agent.feed(ts)

            # Evaluate the performance. Play with random agents.
            if episode % EVALUATE_EVERY == 0:
                logger.log_performance(
                    env.timestep,
                    tournament(eval_env, EVALUATE_NUM_OF_HANDS)[0])

        # Close the logger's files once training is finished
        logger.close_files()

        # Plot the learning curve
        logger.plot(f"DQN_{i}")
    BaseAgent.plot_avg(DQN_RES_DIR, "DQN")
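
Both examples reference module-level constants and project helpers (Logger, tournament, plot_avg, DQNAgent) whose definitions are not shown here. The block below is only a sketch of how those constants might be declared; every value is a placeholder assumption, not the setting used in the original experiments.

# Placeholder values for the module-level constants used above; the real
# experiment settings may differ.
NUM_EXP = 5                        # independent runs per algorithm
NUM_HANDS = 1000                   # hands played by BaseAgent.play()
DQN_TRAINING_DURATION = 100000     # training episodes per DQN run
EVALUATE_EVERY = 100               # evaluate the agent every N episodes
EVALUATE_NUM_OF_HANDS = 1000       # hands per evaluation tournament
DQN_RES_DIR = "./results/dqn"      # where logs and learning curves are saved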