Example #1
def play_using_model(env, model, device, max_steps=10000, epsilon=0.05):
    model.eval()
    reward_acc = 0.0
    memory = ReplayBuffer(max_steps, 4)
    state = env.reset()[..., np.newaxis]
    for _step in tqdm(range(max_steps)):
        env.render()
        last_idx = memory.store_frame(state)
        recent_observations = memory.encode_recent_observation()
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            obs = torch.from_numpy(recent_observations).to(device).unsqueeze(
                0) / 255.0
            with torch.no_grad():
                forward_res = model(obs)
                action = forward_res.argmax(dim=1).item()

        state, reward, done, _ = env.step(action)
        state = state[..., np.newaxis]
        memory.store_effect(last_idx, action, reward, done)

        reward_acc += reward

        if done:
            break

        time.sleep(0.05)

    logging.info(f"Total Reward: {reward_acc}")
    logging.info(f"Average Reward per Timestep: {reward_acc / _step}")
    logging.info(f"Timesteps: {_step}")
Example #2
    def eval_model(self, epoch, n=100):
        self.Q.eval()
        env = get_env(self.env_name, 6, monitor=False)
        rewards = []
        durations = []
        for _e in tqdm(range(n)):
            memory = ReplayBuffer(10000, self.frame_history_len)
            state = env.reset()[..., np.newaxis]
            reward_acc = 0.0
            for t in range(10000):
                if state is None:
                    break

                memory.store_frame(state)
                recent_observations = memory.encode_recent_observation()

                action = self.select_epsilon_greedy_action(
                    recent_observations, None, 0.05).item()
                state, reward, done, _ = env.step(action)

                if done:
                    state = env.reset()

                state = state[..., np.newaxis]
                reward_acc += reward

            rewards.append(reward_acc)
            durations.append(t)
        self.Q.train()
        sum_rewards = sum(rewards)
        sum_durations = sum(durations)
        self.writer.add_scalar(
            f"Mean Reward ({n} episodes)",
            round(sum_rewards / len(rewards), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Duration ({n} episodes)",
            round(sum_durations / len(durations), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Reward per Timestep ({n} episodes)",
            round(sum_rewards / sum_durations, 2),
            epoch,
        )
Example #3
def simulate_policy(env,
                    action_func,
                    num_episodes=NUM_EPISODES,
                    num_timesteps=NUM_TIMESTEPS):
    total_reward = 0
    total_timesteps = 0
    all_rewards_per_episode = []

    memory = ReplayBuffer(num_timesteps, CHANNELS)

    for i in range(num_episodes):
        state = env.reset()[..., np.newaxis]
        curr_reward = 0

        for t in range(num_timesteps):
            env.render()
            last_idx = memory.store_frame(state)
            action = action_func(memory, state)
            state, reward, done, _ = env.step(action)
            state = state[..., np.newaxis]
            memory.store_effect(last_idx, action, reward, done)

            curr_reward += reward
            total_timesteps += 1

            time.sleep(1 / FPS)
            if done:
                break
        total_reward += curr_reward

        curr_episode = i + 1
        reward_per_episode = total_reward / curr_episode
        reward_per_timestep = total_reward / total_timesteps
        timesteps_per_episode = total_timesteps / curr_episode
        all_rewards_per_episode.append(reward_per_episode)

        print_policy_statistics(reward_per_episode,
                                reward_per_timestep,
                                timesteps_per_episode,
                                wrapped_stdev(all_rewards_per_episode),
                                episode_num=curr_episode)

    return (total_reward / num_episodes,
            total_reward / total_timesteps,
            total_timesteps / num_episodes,
            wrapped_stdev(all_rewards_per_episode))
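
simulate_policy delegates action selection to the action_func callback, which receives the replay buffer and the latest raw frame. A hedged sketch of an epsilon-greedy callback in that shape; model, device, and EPSILON are assumptions, and env is assumed to be the same environment passed to simulate_policy:

# Hypothetical action_func; model, device, EPSILON, and env are assumptions.
import random

import torch

EPSILON = 0.05

def epsilon_greedy_action_func(memory, state):
    if random.random() < EPSILON:
        return env.action_space.sample()
    # Stack the most recent CHANNELS frames into the network input.
    obs = memory.encode_recent_observation()
    obs = torch.from_numpy(obs).unsqueeze(0).to(device) / 255.0
    with torch.no_grad():
        return model(obs).argmax(dim=1).item()
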
Example #4
def learn(env,
          policy,
          q_func,
          optimizer_spec,
          session,
          stopping_criterion=None,
          replay_buffer_size=1000000,
          batch_size=32,
          gamma=0.99,
          learning_starts=50000,
          learning_freq=4,
          frame_history_len=4,
          target_update_freq=10000,
          grad_norm_clipping=10,
          lr_multiplier=1.0):
    """Run Deep Q-learning algorithm.
    You can specify your own convnet using q_func.
    All schedules are w.r.t. total number of steps taken in the environment.
    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            img_in: tf.Tensor
                tensorflow tensor representing the input image
            num_actions: int
                number of actions
            scope: str
                scope in which all the model related variables
                should be created
            reuse: bool
                whether previously created variables should be reused.
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    session: tf.Session
        tensorflow session to use.
    policy: object
        Exploration policy used to select actions from the current Q-values;
        its probability of choosing a random action is read from
        policy.current_eps for logging.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    lr_multiplier: float
        Learning-rate multiplier; recorded in the filename of the periodically
        saved training log.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_shape = env.observation_space.shape
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_shape = (img_h, img_w, frame_history_len * img_c)
    num_actions = env.action_space.n

    # set up placeholders
    # placeholder for current observation (or state)
    obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for current action
    act_t_ph = tf.placeholder(tf.int32, [None])
    # placeholder for current reward
    rew_t_ph = tf.placeholder(tf.float32, [None])
    # placeholder for next observation (or state)
    obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape))
    # placeholder for end of episode mask
    # this value is 1 if the next state corresponds to the end of an episode,
    # in which case there is no Q-value at the next state; at the end of an
    # episode, only the current state reward contributes to the target, not the
    # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1)
    done_mask_ph = tf.placeholder(tf.float32, [None])

    # casting to float on GPU ensures lower data transfer times.
    obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0
    obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0

    # Declare variables for logging
    t_log = []
    mean_reward_log = []
    best_mean_log = []
    episodes_log = []
    exploration_log = []
    learning_rate_log = []

    # Create a network to produce the current q values for each possible action
    current_q_func = q_func(obs_t_float,
                            num_actions,
                            scope="q_func",
                            reuse=False)  # Current Q-Value Function
    q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                    scope='q_func')

    # Create the target Q-function network
    target_q_func = q_func(obs_tp1_float,
                           num_actions,
                           scope="target_q_func",
                           reuse=False)  # Target Q-Value Function
    target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                           scope='target_q_func')

    # Encode actions as a one-hot vector, based on the action that was chosen
    act_t = tf.one_hot(act_t_ph,
                       depth=num_actions,
                       dtype=tf.float32,
                       name="action_one_hot")
    q_act_t = tf.reduce_sum(act_t * current_q_func, axis=1)

    # Calculate the Bellman target; the done mask zeroes out the next-state
    # value when the episode has ended
    y = rew_t_ph + gamma * (1.0 - done_mask_ph) * tf.reduce_max(target_q_func,
                                                                axis=1)
    total_error = tf.square(tf.subtract(
        y, q_act_t))  # (r + gamma * (1 - done) * max_a' Q_target(s', a') - Q(s, a))**2

    # construct optimization op (with gradient clipping)
    learning_rate = tf.placeholder(tf.float32, (), name="learning_rate")
    optimizer = optimizer_spec.constructor(learning_rate=learning_rate,
                                           **optimizer_spec.kwargs)
    train_fn = minimize_and_clip(optimizer,
                                 total_error,
                                 var_list=q_func_vars,
                                 clip_val=grad_norm_clipping)

    # update_target_fn will be called periodically to copy Q network to target Q network
    update_target_fn = []
    for var, var_target in zip(
            sorted(q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_fn.append(var_target.assign(var))
    update_target_fn = tf.group(*update_target_fn)

    # construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    model_initialized = False
    num_param_updates = 0
    mean_episode_reward = float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 200000

    for t in itertools.count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator.
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.

        # Store last_obs into replay buffer
        idx = replay_buffer.store_frame(last_obs)

        if t == 0:
            act, reward, done = env.action_space.sample(), 0, False

        # Choose action
        if not model_initialized:
            # choose random action
            act = env.action_space.sample()
        else:
            input_batch = replay_buffer.encode_recent_observation()
            act = policy.select_action(current_q_func, input_batch, obs_t_ph)

        # Step simulator forward one step
        last_obs, reward, done, info = env.step(act)
        replay_buffer.store_effect(
            idx, act, reward,
            done)  # Store action taken after last_obs and corresponding reward

        if done:  # done was True in latest transition; we have already stored that
            last_obs = env.reset()  # Reset observation
            done = False

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # 3.b: initialize the model if it has not been initialized yet; to do
            # that, call
            #    initialize_interdependent_variables(session, tf.global_variables(), {
            #        obs_t_ph: obs_t_batch,
            #        obs_tp1_ph: obs_tp1_batch,
            #    })
            # where obs_t_batch and obs_tp1_batch are the batches of observations at
            # the current and next time step. The boolean variable model_initialized
            # indicates whether or not the model has been initialized.
            # Remember that you have to update the target network too (see 3.d)!
            # 3.c: train the model. To do this, you'll need to use the train_fn and
            # total_error ops that were created earlier: total_error is what you
            # created to compute the total Bellman error in a batch, and train_fn
            # will actually perform a gradient step and update the network parameters
            # to reduce total_error. When calling session.run on these you'll need to
            # populate the following placeholders:
            # obs_t_ph
            # act_t_ph
            # rew_t_ph
            # obs_tp1_ph
            # done_mask_ph
            # (this is needed for computing total_error)
            # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t)
            # (this is needed by the optimizer to choose the learning rate)
            # 3.d: periodically update the target network by calling
            # session.run(update_target_fn)
            # you should update every target_update_freq steps, and you may find the
            # variable num_param_updates useful for this (it was initialized to 0)
            #####

            # 3.a Sample a batch of transitions
            obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample(
                batch_size)

            # 3.b Initialize model if not initialized yet
            if not model_initialized:
                initialize_interdependent_variables(
                    session, tf.global_variables(), {
                        obs_t_ph: obs_t_batch,
                        obs_tp1_ph: obs_tp1_batch,
                    })
                session.run(update_target_fn)
                model_initialized = True

            # 3.c Train the model using train_fn and total_error
            session.run(
                train_fn, {
                    obs_t_ph: obs_t_batch,
                    act_t_ph: act_batch,
                    rew_t_ph: rew_batch,
                    obs_tp1_ph: obs_tp1_batch,
                    done_mask_ph: done_mask,
                    learning_rate: optimizer_spec.lr_schedule.value(t)
                })

            num_param_updates += 1

            # 3.d Update target network every target_update_freq parameter
            # updates (experience replay rounds), per the docstring
            if num_param_updates % target_update_freq == 0:
                session.run(update_target_fn)
            #####

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and model_initialized:
            print("Timestep %d" % (t, ))
            t_log.append(t)
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            mean_reward_log.append(mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            best_mean_log.append(best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            episodes_log.append(len(episode_rewards))
            print("exploration %f" % policy.current_eps)
            exploration_log.append(policy.current_eps)
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            learning_rate_log.append(optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()

        if t % SAVE_EVERY_N_STEPS == 0 and model_initialized:
            training_log = ({
                't_log': t_log,
                'mean_reward_log': mean_reward_log,
                'best_mean_log': best_mean_log,
                'episodes_log': episodes_log,
                'exploration_log': exploration_log,
                'learning_rate_log': learning_rate_log
            })
            output_file_name = f'ram_lr{lr_multiplier}_{t}_data.pkl'
            with open(output_file_name, 'wb') as f:
                pickle.dump(training_log, f)
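
A hedged sketch of how learn might be driven. The exact OptimizerSpec definition and the schedule object are assumptions inferred from the docstring and from optimizer_spec.lr_schedule.value(t) above; env, policy, q_func, and lr_schedule are assumed to be constructed elsewhere:

# Hypothetical setup; OptimizerSpec and the schedule object are assumptions.
from collections import namedtuple

import tensorflow as tf

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs", "lr_schedule"])

optimizer_spec = OptimizerSpec(
    constructor=tf.train.AdamOptimizer,
    kwargs=dict(epsilon=1e-4),
    lr_schedule=lr_schedule,  # assumed to expose .value(t)
)

with tf.Session() as session:
    learn(env, policy, q_func, optimizer_spec, session,
          stopping_criterion=lambda env, t: t >= 2_000_000,
          lr_multiplier=1.0)
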
Example #5
class DQNAgent:
    def __init__(self, settings):
        self.check_settings(settings)

        # Constants
        self.batch_size = settings["batch_size"]
        self.checkpoint_frequency = settings["checkpoint_frequency"]
        self.device = settings["device"]
        self.dtype = (torch.cuda.FloatTensor
                      if self.device.type == "cuda" else torch.FloatTensor)
        self.env_name = settings["env"]
        self.env = get_env(settings["env"], 6)
        self.eps_cliff = settings["eps_cliff"]
        self.eps_start = settings["eps_start"]
        self.eps_end = settings["eps_end"]
        self.frame_history_len = settings["frame_history_len"]
        self.gamma = settings["gamma"]
        self.learning_freq = settings["learning_freq"]
        self.learning_start = settings["learning_start"]
        self.logs_dir = settings["logs_dir"]
        self.log_freq = settings["log_freq"]
        self.memory_size = settings["memory_size"]
        self.model_name = settings["model_name"]
        self.num_actions = self.env.action_space.n
        settings["num_actions"] = self.num_actions
        settings["num_channels"] = self.frame_history_len
        self.out_dir = settings["out_dir"]
        self.target_update_freq = settings["target_update_freq"]
        self.total_timesteps = settings["total_timesteps"]

        # Init models
        self.Q = DQN(settings).to(self.device)
        self.target_Q = DQN(settings).to(self.device)
        self.target_Q.load_state_dict(self.Q.state_dict())
        self.target_Q.eval()

        # Init model supporting objects
        self.memory = ReplayBuffer(self.memory_size, self.frame_history_len)
        self.optimizer = optim.RMSprop(self.Q.parameters(),
                                       lr=settings["lr"],
                                       alpha=0.95,
                                       eps=0.01)
        self.loss = F.smooth_l1_loss

        # Logging
        self.writer = SummaryWriter(self.logs_dir)

    def check_settings(self, settings):
        required_settings = [
            "batch_size",
            "checkpoint_frequency",
            "device",
            "env",
            "eps_start",
            "eps_end",
            "eps_cliff",
            "frame_history_len",
            "gamma",
            "learning_start",
            "log_freq",
            "logs_dir",
            "lr",
            "memory_size",
            "model_name",
            "out_dir",
            "target_update_freq",
            "total_timesteps",
        ]

        if not settings_is_valid(settings, required_settings):
            raise Exception(
                f"Settings object {settings} missing some required settings.")

    def _get_epsilon(self, steps_done):
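        """Linearly anneal epsilon from eps_start to eps_end over eps_cliff steps.

        Illustrative numbers (not taken from any settings above): with
        eps_start=1.0, eps_end=0.05, and eps_cliff=1_000_000, epsilon is 1.0 at
        step 0, about 0.525 at step 500,000, and stays at 0.05 from step
        1,000,000 on.
        """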
        if steps_done < self.eps_cliff:
            epsilon = (-(self.eps_start - self.eps_end) / self.eps_cliff *
                       steps_done + self.eps_start)
        else:
            epsilon = self.eps_end
        return epsilon

    def select_epsilon_greedy_action(self, state, steps_done, epsilon=None):
        if epsilon is None:
            threshold = self._get_epsilon(steps_done)
        else:
            threshold = epsilon
        if random.random() < threshold:
            return torch.IntTensor([random.randrange(self.num_actions)])
        obs = torch.from_numpy(state).type(self.dtype).unsqueeze(0) / 255.0
        with torch.no_grad():
            return self.Q(obs).argmax(dim=1).cpu()  # returns action

    def should_stop(self):
        return (get_wrapper_by_name(self.env, "Monitor").get_total_steps() >=
                self.max_steps)

    def eval_model(self, epoch, n=100):
        self.Q.eval()
        env = get_env(self.env_name, 6, monitor=False)
        rewards = []
        durations = []
        for _e in tqdm(range(n)):
            memory = ReplayBuffer(10000, self.frame_history_len)
            state = env.reset()[..., np.newaxis]
            reward_acc = 0.0
            for t in range(10000):
                if state is None:
                    break

                memory.store_frame(state)
                recent_observations = memory.encode_recent_observation()

                action = self.select_epsilon_greedy_action(
                    recent_observations, None, 0.05).item()
                state, reward, done, _ = env.step(action)

                if done:
                    state = env.reset()

                state = state[..., np.newaxis]
                reward_acc += reward

            rewards.append(reward_acc)
            durations.append(t)
        self.Q.train()
        sum_rewards = sum(rewards)
        sum_durations = sum(durations)
        self.writer.add_scalar(
            f"Mean Reward ({n} episodes)",
            round(sum_rewards / len(rewards), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Duration ({n} episodes)",
            round(sum_durations / len(durations), 2),
            epoch,
        )
        self.writer.add_scalar(
            f"Mean Reward per Timestep ({n} episodes)",
            round(sum_rewards / sum_durations, 2),
            epoch,
        )

    def train(self):
        num_param_updates = 0
        loss_acc_since_last_log = 0.0
        param_updates_since_last_log = 0
        num_episodes = 0

        state = self.env.reset()[..., np.newaxis]
        for t in tqdm(range(self.total_timesteps)):
            last_idx = self.memory.store_frame(state)
            recent_observations = self.memory.encode_recent_observation()

            # Act randomly until learning starts; epsilon-greedy afterwards
            if t > self.learning_start:
                action = self.select_epsilon_greedy_action(
                    recent_observations, t).item()
            else:
                action = random.randrange(self.num_actions)

            # Advance a step
            next_state, reward, done, _ = self.env.step(action)
            next_state = next_state[..., np.newaxis]

            # Store result in memory
            self.memory.store_effect(last_idx, action, reward, done)

            # Reset if done (life lost, due to atari wrapper)
            if done:
                next_state = self.env.reset()
                next_state = next_state[..., np.newaxis]
            state = next_state

            # Train network using experience replay when
            # memory is sufficiently large.
            if (t > self.learning_start and t % self.learning_freq == 0
                    and self.memory.can_sample(self.batch_size)):
                # Sample from replay buffer
                (
                    state_batch,
                    act_batch,
                    r_batch,
                    next_state_batch,
                    done_mask,
                ) = self.memory.sample(self.batch_size)
                state_batch = torch.from_numpy(state_batch).type(
                    self.dtype) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(self.device)
                r_batch = torch.from_numpy(r_batch).to(self.device)
                next_state_batch = (
                    torch.from_numpy(next_state_batch).type(self.dtype) /
                    255.0)
                not_done_mask = torch.from_numpy(1 - done_mask).type(
                    self.dtype)

                # Calculate current Q value
                current_Q_vals = self.Q(state_batch).gather(
                    1, act_batch.unsqueeze(1))

                # Calculate next Q value based on action that gives max Q vals
                next_max_Q = self.target_Q(next_state_batch).detach().max(
                    dim=1)[0]
                next_Q_vals = not_done_mask * next_max_Q

                # Calculate target of current Q values
                target_Q_vals = r_batch + (self.gamma * next_Q_vals)

                # Calculate loss and backprop
                loss = F.smooth_l1_loss(current_Q_vals.squeeze(),
                                        target_Q_vals)
                self.optimizer.zero_grad()
                loss.backward()
                for param in self.Q.parameters():
                    param.grad.data.clamp_(-1, 1)

                # Update weights
                self.optimizer.step()
                num_param_updates += 1

                # Store stats
                loss_acc_since_last_log += loss.item()
                param_updates_since_last_log += 1

                # Update target network periodically
                if num_param_updates % self.target_update_freq == 0:
                    self.target_Q.load_state_dict(self.Q.state_dict())

                # Save model checkpoint
                if num_param_updates % self.checkpoint_frequency == 0:
                    save_model_checkpoint(
                        self.Q,
                        self.optimizer,
                        t,
                        f"{self.out_dir}/checkpoints/{self.model_name}_{num_param_updates}",
                    )

                # Log progress
                if (num_param_updates % (self.log_freq // 2) == 0
                        and param_updates_since_last_log > 0):
                    self.writer.add_scalar(
                        "Mean Loss per Update (Updates)",
                        loss_acc_since_last_log / param_updates_since_last_log,
                        num_param_updates,
                    )
                    loss_acc_since_last_log = 0.0
                    param_updates_since_last_log = 0

                if num_param_updates % self.log_freq == 0:
                    wrapper = get_wrapper_by_name(self.env, "Monitor")
                    episode_rewards = wrapper.get_episode_rewards()
                    mean_reward = round(np.mean(episode_rewards[-101:-1]), 2)
                    sum_reward = np.sum(episode_rewards[-101:-1])
                    episode_lengths = wrapper.get_episode_lengths()
                    mean_duration = round(np.mean(episode_lengths[-101:-1]), 2)
                    sum_duration = np.sum(episode_lengths[-101:-1])

                    self.writer.add_scalar(
                        f"Mean Reward (epoch = {self.log_freq} updates)",
                        mean_reward,
                        num_param_updates // self.log_freq,
                    )
                    self.writer.add_scalar(
                        f"Mean Duration (epoch = {self.log_freq} updates)",
                        mean_duration,
                        num_param_updates // self.log_freq,
                    )
                    self.writer.add_scalar(
                        f"Mean Reward per Timestep (epoch = {self.log_freq} updates)",
                        round(sum_reward / sum_duration, 2),
                        num_param_updates // self.log_freq,
                    )

            if done:
                num_episodes += 1

        # Save model
        save_model(self.Q, f"{self.out_dir}/{self.model_name}.model")

        self.env.close()

        print(f"Number of Episodes: {num_episodes}")

        return self.Q
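
A hedged sketch of the settings dictionary DQNAgent expects. The keys mirror required_settings in check_settings (plus learning_freq, which __init__ also reads); every value below is an illustrative assumption:

# Illustrative values only; keys mirror check_settings, values are assumptions.
import torch

settings = {
    "batch_size": 32,
    "checkpoint_frequency": 1000,
    "device": torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    "env": "PongNoFrameskip-v4",
    "eps_start": 1.0,
    "eps_end": 0.05,
    "eps_cliff": 1_000_000,
    "frame_history_len": 4,
    "gamma": 0.99,
    "learning_freq": 4,
    "learning_start": 50_000,
    "log_freq": 1000,
    "logs_dir": "runs/dqn",
    "lr": 2.5e-4,
    "memory_size": 100_000,
    "model_name": "dqn_pong",
    "out_dir": "out",
    "target_update_freq": 10_000,
    "total_timesteps": 2_000_000,
}

agent = DQNAgent(settings)
trained_Q = agent.train()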