Example #1
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels in the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Inference only: run the forward pass under torch.no_grad() so no autograd history is saved
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE
    Q = q_func(input_arg, num_actions)
    Q_target = q_func(input_arg, num_actions)

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        if (t > learning_starts):
            action = select_epilson_greedy_action(Q, encoded_obs, t)
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if (done):
            last_obs = env.reset()
        else:
            last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            #
            # Notes:
            # - The learning rate (alpha) of the Q update is not set here; it lives in the OptimizerSpec built in main.
            # - Batches are moved to the GPU when one is available.
            # - Terminal transitions are masked out via the done mask, so no bootstrap term is added past the end of an episode.
            # - The Bellman error is clipped to [-1, 1] and negated before the backward call (PyTorch minimizes).
            # - A natural knob for tuning is the exploration schedule set in main.
            #
            # Q.cuda()
            obs_batch, act_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size=batch_size)
            states = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            actions = Variable(torch.from_numpy(act_batch).long())
            rewards = Variable(torch.from_numpy(reward_batch).float())
            next_states = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_dones = Variable(torch.from_numpy(1 - done_mask).type(dtype))
            if USE_CUDA:
                states = states.cuda()
                actions = actions.cuda()
                rewards = rewards.cuda()
                next_states = next_states.cuda()
            Q.train()
            Q_target.eval()
            predicted_rewards = Q(states).gather(1,
                                                 actions.unsqueeze(1))  #Q(s,a)
            next_max_Q = Q_target(next_states).detach().max(1)[0]  # max_a Q_target(s', a)
            next_Q_values = not_dones * next_max_Q
            target_Q_values = rewards + (gamma * next_Q_values)  #r + Q_target
            bellman_error = target_Q_values - predicted_rewards.squeeze(1)
            clipped_bellman_error = bellman_error.clamp(-1, 1) * (-1.0)
            optimizer.zero_grad()
            predicted_rewards.backward(clipped_bellman_error.data.unsqueeze(1))
            optimizer.step()
            num_param_updates += 1
            if (num_param_updates % target_update_freq == 0):
                Q_target.load_state_dict(Q.state_dict())

            # for obs,act,reward,next_obs,done in zip(obs_batch,act_batch,reward_batch,next_obs_batch,done_mask):
            #     if(done == 1.0):
            #         continue
            #     obs = Variable(torch.from_numpy(obs, ).type(dtype).unsqueeze(0) / 255.0, requires_grad=True)
            #     next_obs = Variable(torch.from_numpy(next_obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=False)
            #     current_Q = Q(obs)
            #     predicted_reward = Variable(current_Q[0][act].unsqueeze(0), requires_grad=True)
            #     target_reward = Q_target(next_obs).data.max(1)[0]
            #     loss = loss_fn(reward + gamma * target_reward, predicted_reward).clamp(-1, 1) * (-1.0)

            #     optimizer.zero_grad()
            #     # should be current.backward(d_error.data.unsqueeze(1))
            #     # but it crashes on misfitting dims
            #     predicted_reward.backward(loss.data.unsqueeze(1))

            #     optimizer.step()
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Example #2
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx will be used to store the action, reward and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose random action if not yet start learning
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q values: the network takes only the state and outputs a value for every action;
            # we select the Q value of the action that was actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute the next Q value from the action that gives the max Q value
            # Detach from the current graph since we don't want gradients for the next Q values to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute Bellman error (squeeze so both operands have shape [batch_size])
            bellman_error = target_Q_values - current_Q_values.squeeze(1)
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data.unsqueeze(1))

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by copying the Q network weights into it
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Example #3
        d_error = clipped_bellman_error * -1.0
        # Clear previous gradients before backward pass
        optimizer.zero_grad()
        # run backward pass
        current_Q_values.backward(d_error.data.unsqueeze(1))

        # Perform the update
        optimizer.step()
        num_param_updates += 1

        # Periodically update the target network by copying the Q network weights into it
        if num_param_updates % target_update_freq == 0:
            target_Q.load_state_dict(Q.state_dict())

    ### 4. Log progress and keep track of statistics
    episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
    if len(episode_rewards) > 0:
        mean_episode_reward = np.mean(episode_rewards[-100:])
    if len(episode_rewards) > 100:
        best_mean_episode_reward = max(best_mean_episode_reward,
                                       mean_episode_reward)

    Statistic["mean_episode_rewards"].append(mean_episode_reward)
    Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

    if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
        print("Timestep %d" % (t, ))
        print("mean reward (100 episodes) %f" % mean_episode_reward)
        print("best mean reward %f" % best_mean_episode_reward)
        print("episodes %d" % len(episode_rewards))
        print("exploration %f" % exploration.value(t))
Example #4
 def stopping_criterion(env):
     # notice that here t is the number of steps of the wrapped env,
     # which is different from the number of steps in the underlying env
     return get_wrapper_by_name(
         env, "Monitor").get_total_steps() >= num_timesteps
Example #5
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels in the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            return model(Variable(obs,
                                  volatile=True)).data.max(1)[1].view(1, 1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    rewards = []
    episodes = []
    reward_list = []
    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx will be used to store the action, reward and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose random action if not yet start learning
        if t > learning_starts:
            #action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
            action = select_epilson_greedy_action(Q, recent_observations,
                                                  t)[0][0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))

        if t == 20000000:
            pylab.plot(episodes, reward_list, 'b')
            pylab.savefig("./save_graph/breakout_dqn.png")

        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q values: the network takes only the state and outputs a value for every action;
            # we select the Q value of the action that was actually taken.
            # current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            # squeeze the [batch_size x 1] tensor down to shape [batch_size]
            # Compute the next Q value from the action that gives the max Q value
            # Detach from the current graph since we don't want gradients for the next Q values to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # # Compute Bellman error
            # bellman_error = target_Q_values - current_Q_values
            # # clip the bellman error between [-1 , 1]
            # clipped_bellman_error = bellman_error.clamp(-1, 1)
            # # Note: clipped_bellman_delta * -1 will be right gradient
            # d_error = clipped_bellman_error * -1.0
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            loss.backward()
            # Clip the gradients to lie between -1 and +1
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by copying the Q network weights into it
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            reward_list.append(mean_episode_reward)
            episodes.append(len(episode_rewards))  # append the episode count so the x and y lists line up for plotting

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Example #6
        q_func=DQN,
        optimizer_spec=optimizer_spec,
        exploration=exploration_schedule,
        stopping_criterion=stopping_criterion,
        replay_buffer_size=REPLAY_BUFFER_SIZE,
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        learning_starts=LEARNING_STARTS,
        learning_freq=LEARNING_FREQ,
        frame_history_len=FRAME_HISTORY_LEN,
        target_update_freq=TARGER_UPDATE_FREQ,
        statistics_file_name=run.statistics_file_name
    )


if __name__ == '__main__':
    # Get Atari games.
    benchmark = gym.benchmark_spec('Atari40M')

    # Change the index to select a different game.
    task = benchmark.tasks[3]

    # Run training
    seed = 0  # Use a seed of zero (you may want to randomize the seed!)
    env = get_env(task, seed)

    # do not record videos:
    get_wrapper_by_name(env, "Monitor").video_callable = lambda episode_id: False

    main(env, task.max_timesteps)
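
Example #6 is the tail of a main script; the constants, optimizer_spec and exploration_schedule it passes in are defined earlier in that file and are not shown here. A hedged sketch of a typical setup (the RMSprop hyperparameters are illustrative only, the constant values simply mirror the dqn_learing defaults, and TARGER_UPDATE_FREQ keeps the original constant's spelling):

from collections import namedtuple

import torch.optim as optim

from utils.schedule import LinearSchedule  # schedule class referenced in the dqn_learing docstring

# OptimizerSpec bundles the optimizer constructor and its kwargs, as the
# dqn_learing docstring describes (some variants also carry an lr_schedule
# field, see the logging in Example #7).
OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

REPLAY_BUFFER_SIZE = 1000000
BATCH_SIZE = 32
GAMMA = 0.99
LEARNING_STARTS = 50000
LEARNING_FREQ = 4
FRAME_HISTORY_LEN = 4
TARGER_UPDATE_FREQ = 10000

optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),  # illustrative values
)

exploration_schedule = LinearSchedule(1000000, 0.1)  # anneal epsilon from 1.0 to 0.1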
Example #7
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration=LinearSchedule(1000000, 0.1),
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000,
                grad_norm_clipping=10):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels in the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env, t) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    grad_norm_clipping: float or None
        If not None gradients' norms are clipped to this value.
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    # optimizer_func = construct_optimizer_func(Q, optimizer_spec)
    optimizer = torch.optim.Adam(Q.parameters())

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx will be used to store the action, reward and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # recent_observations has shape (img_h, img_w, frame_history_len); transpose to (frame_history_len, img_h, img_w) as the model input
        recent_observations = replay_buffer.encode_recent_observation(
        ).transpose(2, 0, 1)

        # Choose random action if not yet start learning
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch.transpose(0, 3, 1, 2)).type(dtype) /
                255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch.transpose(
                    0, 3, 1, 2)).type(dtype) / 255.0)
            done_mask = torch.from_numpy(done_mask)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                done_mask = done_mask.cuda()

            # Compute current Q values: the network takes only the state and outputs a value for every action;
            # we select the Q value of the action that was actually taken (squeeze to shape [batch_size]).
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze(1)
            # Compute next Q values from the action that gives the max Q value
            next_max_Q_values = Variable(torch.zeros(batch_size).type(dtype))
            # Detach from the current graph since we don't want gradients to propagate through the target network;
            # index the target output with the same mask so the shapes of both sides match
            next_max_Q_values[done_mask == 0] = target_Q(
                next_obs_batch).detach().max(1)[0][done_mask == 0]
            # Compute Bellman error, use huber loss to mitigate outlier impact
            target_Q_values = rew_batch + (gamma * next_max_Q_values)
            bellman_error = F.smooth_l1_loss(current_Q_values, target_Q_values)

            # Clear previous gradients (the optimizer itself was constructed once, above)
            optimizer.zero_grad()

            # run backward pass and clip the gradient
            bellman_error.backward()
            nn.utils.clip_grad_norm(Q.parameters(), grad_norm_clipping)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by copying the Q network weights into it
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            print("learning_rate %f" % optimizer_spec.lr_schedule.value(t))
            sys.stdout.flush()
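
Example #7 defaults exploration to LinearSchedule(1000000, 0.1), and the loops only ever call exploration.value(t). A minimal schedule consistent with that usage (an assumption; the assignment's utils.schedule may differ in details such as the starting value):

class LinearSchedule(object):
    """Linearly anneal from initial_p to final_p over schedule_timesteps,
    then stay at final_p."""

    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)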
Example #8
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000,
                statistics_file_name="statistics.pkl"):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels in the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for the probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network

    statistics_file_name: str
        Where to store the statistics file
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    print("STATISTICS_FILE_NAME: {}".format(statistics_file_name))

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(
                torch_types.FloatTensor).unsqueeze(0) / 255.0
            # Inference only: run the forward pass under torch.no_grad() so no autograd history is saved
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return random.randrange(num_actions)

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE
    policy_net = q_func(input_arg, num_actions).to(device).type(
        torch_types.FloatTensor)  # Q
    target_net = q_func(input_arg, num_actions).to(device).type(
        torch_types.FloatTensor)  # Q target
    target_net.load_state_dict(
        policy_net.state_dict())  # copies the state of policy Q into target

    ######

    # Construct policy_net network optimizer function
    optimizer = optimizer_spec.constructor(policy_net.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        # YOUR CODE HERE
        stored_frame_idx = replay_buffer.store_frame(last_obs)
        last_obs_encoded = replay_buffer.encode_recent_observation()
        action = select_epilson_greedy_action(policy_net, last_obs_encoded, t)

        obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(stored_frame_idx, action, reward, done)

        if done:
            obs = env.reset()

        last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            sample = replay_buffer.sample(batch_size)
            obs_batch, actions_batch, rewards_batch, next_obs_batch, done_mask = sample

            # convert batches to pytorch tensors:
            obs_batch = torch.from_numpy(obs_batch).to(device).type(
                torch_types.FloatTensor) / 255.0
            next_obs_batch = torch.from_numpy(next_obs_batch).to(device).type(
                torch_types.FloatTensor) / 255.0
            actions_batch = torch.from_numpy(actions_batch).to(device).type(
                torch_types.LongTensor)
            rewards_batch = torch.from_numpy(rewards_batch).to(device).type(
                torch_types.FloatTensor)
            non_final_mask = 1 - torch.from_numpy(done_mask).to(device).type(
                torch_types.FloatTensor)

            # inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html:

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken
            state_action_values = policy_net(obs_batch).gather(
                1, actions_batch.unsqueeze(1)).squeeze(1)

            # Compute V(s_{t+1}) for all next states.
            next_state_values = target_net(next_obs_batch).max(
                1)[0].detach() * non_final_mask
            # Compute the expected Q values
            expected_state_action_values = (next_state_values *
                                            gamma) + rewards_batch

            # Compute loss
            d_error = state_action_values - expected_state_action_values  # = -bellman_error
            d_error.clamp_(-1, 1)

            # Optimize the model
            optimizer.zero_grad()
            state_action_values.backward(d_error)
            optimizer.step()

            num_param_updates += 1
            # Periodically update target network:
            if num_param_updates % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t >= learning_starts:
            print("Timestep %d" % (t, ))
            print("  mean reward (100 episodes) %f" % mean_episode_reward)
            print("  best mean reward %f" % best_mean_episode_reward)
            print("  episodes %d" % len(episode_rewards))
            print("  exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open(statistics_file_name, 'wb') as f:
                pickle.dump(Statistic, f)
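
Example #8 relies on module-level device and torch_types helpers that live elsewhere in its file. Inferred only from how they are used above (a .to(device) target plus FloatTensor/LongTensor attributes), a plausible minimal definition:

from collections import namedtuple

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Bundle the tensor types the example indexes as torch_types.FloatTensor / torch_types.LongTensor
TorchTypes = namedtuple("TorchTypes", ["FloatTensor", "LongTensor"])
torch_types = TorchTypes(
    FloatTensor=torch.cuda.FloatTensor if torch.cuda.is_available() else torch.FloatTensor,
    LongTensor=torch.cuda.LongTensor if torch.cuda.is_available() else torch.LongTensor,
)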
Example #9
    def train(self):
        num_param_updates = 0
        loss_acc_since_last_log = 0.0
        param_updates_since_last_log = 0
        num_episodes = 0

        state = self.env.reset()[..., np.newaxis]
        for t in tqdm(range(self.total_timesteps)):
            last_idx = self.memory.store_frame(state)
            recent_observations = self.memory.encode_recent_observation()

            # Choose random action if learning hasn't started yet
            if t > self.learning_start:
                action = self.select_epsilon_greedy_action(
                    recent_observations, t).item()
            else:
                action = random.randrange(self.num_actions)

            # Advance a step
            next_state, reward, done, _ = self.env.step(action)
            next_state = next_state[..., np.newaxis]

            # Store result in memory
            self.memory.store_effect(last_idx, action, reward, done)

            # Reset if done (life lost, due to atari wrapper)
            if done:
                next_state = self.env.reset()
                next_state = next_state[..., np.newaxis]
            state = next_state

            # Train network using experience replay when
            # memory is sufficiently large.
            if (t > self.learning_start and t % self.learning_freq == 0
                    and self.memory.can_sample(self.batch_size)):
                # Sample from replay buffer
                (
                    state_batch,
                    act_batch,
                    r_batch,
                    next_state_batch,
                    done_mask,
                ) = self.memory.sample(self.batch_size)
                state_batch = torch.from_numpy(state_batch).type(
                    self.dtype) / 255.0
                act_batch = torch.from_numpy(act_batch).long().to(self.device)
                r_batch = torch.from_numpy(r_batch).to(self.device)
                next_state_batch = (
                    torch.from_numpy(next_state_batch).type(self.dtype) /
                    255.0)
                not_done_mask = torch.from_numpy(1 - done_mask).type(
                    self.dtype)

                # Calculate current Q value
                current_Q_vals = self.Q(state_batch).gather(
                    1, act_batch.unsqueeze(1))

                # Calculate next Q value based on action that gives max Q vals
                next_max_Q = self.target_Q(next_state_batch).detach().max(
                    dim=1)[0]
                next_Q_vals = not_done_mask * next_max_Q

                # Calculate target of current Q values
                target_Q_vals = r_batch + (self.gamma * next_Q_vals)

                # Calculate loss and backprop
                loss = F.smooth_l1_loss(current_Q_vals.squeeze(),
                                        target_Q_vals)
                self.optimizer.zero_grad()
                loss.backward()
                for param in self.Q.parameters():
                    param.grad.data.clamp_(-1, 1)

                # Update weights
                self.optimizer.step()
                num_param_updates += 1

                # Store stats
                loss_acc_since_last_log += loss.item()
                param_updates_since_last_log += 1

                # Update target network periodically
                if num_param_updates % self.target_update_freq == 0:
                    self.target_Q.load_state_dict(self.Q.state_dict())

                # Save model checkpoint
                if num_param_updates % self.checkpoint_frequency == 0:
                    save_model_checkpoint(
                        self.Q,
                        self.optimizer,
                        t,
                        f"{self.out_dir}/checkpoints/{self.model_name}_{num_param_updates}",
                    )

                # Log progress
                if (num_param_updates % (self.log_freq // 2) == 0
                        and param_updates_since_last_log > 0):
                    self.writer.add_scalar(
                        "Mean Loss per Update (Updates)",
                        loss_acc_since_last_log / param_updates_since_last_log,
                        num_param_updates,
                    )
                    loss_acc_since_last_log = 0.0
                    param_updates_since_last_log = 0

                if num_param_updates % self.log_freq == 0:
                    wrapper = get_wrapper_by_name(self.env, "Monitor")
                    episode_rewards = wrapper.get_episode_rewards()
                    mean_reward = round(np.mean(episode_rewards[-101:-1]), 2)
                    sum_reward = np.sum(episode_rewards[-101:-1])
                    episode_lengths = wrapper.get_episode_lengths()
                    mean_duration = round(np.mean(episode_lengths[-101:-1]), 2)
                    sum_duration = np.sum(episode_lengths[-101:-1])

                    self.writer.add_scalar(
                        f"Mean Reward (epoch = {self.log_freq} updates)",
                        mean_reward,
                        num_param_updates // self.log_freq,
                    )
                    self.writer.add_scalar(
                        f"Mean Duration (epoch = {self.log_freq} updates)",
                        mean_duration,
                        num_param_updates // self.log_freq,
                    )
                    self.writer.add_scalar(
                        f"Mean Reward per Timestep (epoch = {self.log_freq} updates)",
                        round(sum_reward / sum_duration, 2),
                        num_param_updates // self.log_freq,
                    )

            if done:
                num_episodes += 1

        # Save model
        save_model(self.Q, f"{self.out_dir}/{self.model_name}.model")

        self.env.close()

        print(f"Number of Episodes: {num_episodes}")

        return self.Q
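
Example No. 9 relies on a select_epsilon_greedy_action helper that is not shown. A plausible minimal version, assuming the trainer exposes the exploration schedule, dtype, num_actions and Q attributes used above, could look like the following sketch (it mirrors the epsilon-greedy helpers in the other examples rather than the author's actual method).

import random
import torch

def select_epsilon_greedy_action(self, obs, t):
    # With probability eps pick a random action, otherwise act greedily w.r.t. Q
    eps = self.exploration.value(t)
    if random.random() < eps:
        return torch.tensor([[random.randrange(self.num_actions)]])
    obs_t = torch.from_numpy(obs).type(self.dtype).unsqueeze(0) / 255.0
    with torch.no_grad():
        return self.Q(obs_t).argmax(dim=1).cpu()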
Example No. 10
    def should_stop(self):
        return (get_wrapper_by_name(self.env, "Monitor").get_total_steps() >=
                self.max_steps)
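
A criterion like this is usually handed to the training loop as a callable matching the stopping_criterion(env) -> bool contract described in the docstrings below. A small sketch of that wiring, assuming the same get_wrapper_by_name helper; the factory function itself is illustrative only.

def make_stopping_criterion(max_steps):
    # Returns a callable with the stopping_criterion(env) -> bool signature
    def should_stop(env):
        monitor = get_wrapper_by_name(env, "Monitor")
        return monitor.get_total_steps() >= max_steps
    return should_stop

# e.g. stop after two million environment steps:
# dqn_learing(env, q_func, optimizer_spec, exploration,
#             stopping_criterion=make_stopping_criterion(2_000_000))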
Example No. 11
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    print("running new version")
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Inference only: no need to track history for the forward pass
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    """ ---------------------------- OUR CODE ---------------------------- """
    Q = q_func(input_arg, num_actions)  # The parameters are random
    Qtag = q_func(input_arg, num_actions)
    if (USE_CUDA):
        Q.cuda()
        Qtag.cuda()
    Qtag.load_state_dict(Q.state_dict())

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)
    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
    """ ------------------------------------------------------------------ """

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    reward = None
    done = None
    info = None
    LOG_EVERY_N_STEPS = 10000

    startTime = time.time()

    for t in count():
        """ Tsuf: ---- Stuff for debigging times for various places --- """
        T1 = T2 = T3 = T4 = T5 = T6 = T7 = T8 = 0
        t1Tmp = t2Tmp = t3Tmp = t4Tmp = t5Tmp = t6Tmp = t7Tmp = t8Tmp = 0
        """ ----------------------------------------------------------- """
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        #if (t>1000000):
        #    break
        ### 2. Step the env and store the transition

        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        """ -------------------------- OUR CODE -------------------------- """

        # Store last_obs and encode the most recent frames as the network input
        t1Tmp = time.time()
        cur_idx = replay_buffer.store_frame(last_obs)
        next_input = replay_buffer.encode_recent_observation()
        T1 += time.time() - t1Tmp

        #take random action or use the net
        t2Tmp = time.time()
        action = select_epilson_greedy_action(
            Q, next_input, t)  #the returned action is on the CPU
        T2 += time.time() - t2Tmp

        #see what happens after we take that action
        t3Tmp = time.time()
        last_obs, reward, done, info = env.step(
            action)  #the returned parameters are on the CPU
        T3 += time.time() - t3Tmp

        #     print(t)
        # env.render()
        #store the results on the replay buffer
        replay_buffer.store_effect(cur_idx, action, reward, done)  #on the CPU

        #if the simulation is done, reset the environment
        if (done):
            last_obs = env.reset()
        """ -------------------------------------------------------------- """

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            """ ------------------------ OUR CODE ------------------------ """

            #sample a batch of history samples
            t4Tmp = time.time()
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)  #on CPU

            obs_batch = torch.from_numpy(obs_batch).type(
                dtype) / 255.0  # When available, move the samples batch to GPU
            next_obs_batch = torch.from_numpy(next_obs_batch).type(
                dtype) / 255.0  #GPU
            T4 += time.time() - t4Tmp

            #see which Q values the current network gives, for all obs's
            t5Tmp = time.time()
            inter_Qs = Q(
                Variable(obs_batch))  #input is on GPU, output is on GPU
            inter_Qs_chosen = Variable(
                torch.zeros(batch_size).type(dtype))  #GPU
            #take the action that was chosen before
            for i in range(batch_size):
                inter_Qs_chosen[i] = inter_Qs[i, act_batch[i]]
            #take only the intermediate (non-terminal) obs's
            inter_idx = np.where(done_mask == False)[0]  #CPU
            inter_next_obs_batch = next_obs_batch[inter_idx, :, :, :]
            T5 += time.time() - t5Tmp

            #see what the "target" (backed-up) network says for the intermediate ones
            t6Tmp = time.time()
            with torch.no_grad():
                inter_next_Qs = Qtag(
                    Variable(inter_next_obs_batch)).data.max(1)[0]  #All on GPU
            T6 += time.time() - t6Tmp

            #calculate the bellman errors
            t7Tmp = time.time()
            #for final obs's, the target is just the reward
            targets = torch.from_numpy(rew_batch).type(
                dtype)  #Moved rew_batch to GPU (as 'targets')
            for (i, idx) in enumerate(inter_idx):
                targets[idx] += gamma * inter_next_Qs[i]  #The bellman item
            # errors = -(inter_Qs_chosen.data - targets)**2 #EQUATION COULD BE WRONG!!   [on GPU]
            # for i in range(len(errors)):
            #     if errors[i]<-1:
            #         errors[i] = -1
            #     elif errors[i]>1:
            #         errors[i] = 1
            errors = inter_Qs_chosen.data - targets
            errors.clamp_(-1, 1)  #clamp_ is in-place; plain clamp() would discard the result
            T7 += time.time() - t7Tmp

            #train the network! (:
            t8Tmp = time.time()
            optimizer.zero_grad()
            inter_Qs_chosen.backward(
                errors)  #errors = Q - target, i.e. the gradient of the squared error w.r.t. Q   [Everything is on GPU (: ]
            optimizer.step()
            T8 += time.time() - t8Tmp

            num_param_updates += 1
            if (num_param_updates % target_update_freq == 0):
                Qtag.load_state_dict(Q.state_dict())
            """ ---------------------------------------------------------- """

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)
        Statistic["running_times"].append(int(time.time() - startTime))

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            if (PRINT_TIMES):
                print("-----------------------")
                print(T1)
                print(T2)
                print(T3)
                print(T4)
                print(T5)
                print(T6)
                print(T7)
                print(T8)
                print("-----------------------")
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Example No. 12
def dqn_learing(
    env,
    q_func,
    optimizer_spec,
    exploration,
    feature_tested,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000,
):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    # added bool_flag for double save statistic
    bool_flag = False
    STATS_FILE_NAME = 'statistics ' + feature_tested + '.pkl'

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                # Inference only: no need to track history for the forward pass
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE

    if USE_CUDA:
        Q = q_func(num_actions=num_actions).cuda()
        Q_target = q_func(num_actions=num_actions).cuda()
    else:
        Q = q_func(num_actions=num_actions)
        Q_target = q_func(num_actions=num_actions)

    Q_target.load_state_dict(Q.state_dict())

    # Check & load pretrained model
    if os.path.isfile('Q_params' + feature_tested + '.pkl'):
        print('Load Q parameters ...')
        Q.load_state_dict(torch.load('Q_params' + feature_tested + '.pkl'))

    if os.path.isfile('target_Q_params' + feature_tested + '.pkl'):
        print('Load target Q parameters ...')
        Q_target.load_state_dict(
            torch.load('target_Q_params' + feature_tested + '.pkl'))

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    Statistic = {
        "starting_Q_values": [],
        "mean_episode_rewards": [],
        "best_mean_episode_rewards": []
    }

    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')

    # load prev Stats
    start = 0
    if os.path.isfile(STATS_FILE_NAME):
        with open(STATS_FILE_NAME, 'rb') as f:
            Statistic = pickle.load(f)
            mean_episode_reward = Statistic["mean_episode_rewards"][-1]
            best_mean_episode_reward = Statistic["best_mean_episode_rewards"][
                -1]
            start = len(Statistic["mean_episode_rewards"])
            print('Load %s ...' % STATS_FILE_NAME)
    done = False

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 50000  #10000

    for t in count(start):
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            return Statistic
        # stopping_criterion alone did not handle every case, so also cap the run:
        if t > 4500000:
            return Statistic
        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        # YOUR CODE HERE
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        action = select_epilson_greedy_action(Q, encoded_obs, t)
        # If the previous step ended an episode, encoded_obs is the first frame of a
        # new game, so log the Q-value of this starting state
        ######
        # ('done' still holds its value from the previous iteration; it starts as False)

        if t > learning_starts and done:
            # a very expensive statistic - so don't log frequently
            with torch.no_grad():
                obs = torch.from_numpy(encoded_obs).type(dtype).unsqueeze(
                    0) / 255.0
                item = torch.max(Q(Variable(obs))).item()
            Statistic["starting_Q_values"].append(item)

        ######
        # this steps the environment forward one step
        last_obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)

        if done:
            last_obs = env.reset()

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            # 3.a sample a batch of transitions
            sample = replay_buffer.sample(batch_size)
            obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = sample

            # move variables to GPU if available

            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype)) / 255.0
            action_batch = Variable(
                torch.from_numpy(action_batch).type(dtype).long().view(-1, 1))
            reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype)) / 255.0
            done_mask = Variable(torch.from_numpy(done_mask).type(dtype))

            # 3.b compute the Bellman error
            # evaluating the current and next Q-values
            state_action_values = Q(obs_batch).gather(1, action_batch)
            next_state_values = Q_target(next_obs_batch).detach()

            # maskout post terminal status Q-values
            masked_next_state_values = next_state_values.max(1)[0] * (
                1 - done_mask)
            # constructing the corresponding error
            expected_state_action_values = (masked_next_state_values *
                                            gamma) + reward_batch
            bellman_error = expected_state_action_values.unsqueeze(
                1) - state_action_values

            # clip the error between [-1,1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            optimizer.zero_grad()
            # multiply by -1 (since pytorch minimizes)
            state_action_values.backward(-clipped_bellman_error)

            # 3.c: train the model
            optimizer.step()

            # 3.d periodically update the target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                Q_target.load_state_dict(Q.state_dict())

            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Save the trained model
            torch.save(Q.state_dict(), 'Q_params' + feature_tested + '.pkl')
            torch.save(Q_target.state_dict(),
                       'target_Q_params' + feature_tested + '.pkl')
            # Dump statistics to pickle
            #double save
            if bool_flag:
                bool_flag = False
                with open(STATS_FILE_NAME, 'wb') as f:
                    pickle.dump(Statistic, f)
                    print("Saved to %s" % STATS_FILE_NAME)
            else:
                bool_flag = True
                with open('copy_' + STATS_FILE_NAME, 'wb') as f:
                    pickle.dump(Statistic, f)
                    print("Saved to %s" % 'copy_' + STATS_FILE_NAME)

            plt.clf()
            plt.xlabel('Num of Games')
            plt.ylabel('Q-values on starting state')
            plt.plot(range(len(Statistic["starting_Q_values"])),
                     Statistic["starting_Q_values"],
                     label='Q-values')
            plt.legend()
            plt.title(feature_tested)
            plt.savefig('Q-value-Performance' + feature_tested + '.png')

            plt.clf()
            plt.xlabel('Timesteps')
            plt.ylabel('Mean Reward (past 100 episodes)')
            num_items = len(Statistic["mean_episode_rewards"])
            plt.plot(range(num_items),
                     Statistic["mean_episode_rewards"],
                     label='mean reward')
            plt.plot(range(num_items),
                     Statistic["best_mean_episode_rewards"],
                     label='best mean rewards')
            plt.legend()
            plt.title(feature_tested)
            plt.savefig('DeepQ-Performance' + feature_tested + '.png')
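
Example No. 12 resumes training by reloading only the network weights, so the optimizer's internal state (e.g. RMSprop or Adam moments) is lost across restarts. A minimal sketch of a combined checkpoint using plain torch.save/torch.load; the function and file names here are assumptions, not part of the original code.

import os
import torch

def save_checkpoint(path, Q, Q_target, optimizer, t):
    # One file holds everything needed to resume exactly where training stopped
    torch.save({
        "t": t,
        "Q": Q.state_dict(),
        "Q_target": Q_target.state_dict(),
        "optimizer": optimizer.state_dict(),
    }, path)

def load_checkpoint(path, Q, Q_target, optimizer):
    if not os.path.isfile(path):
        return 0  # nothing to resume from
    ckpt = torch.load(path)
    Q.load_state_dict(ckpt["Q"])
    Q_target.load_state_dict(ckpt["Q_target"])
    optimizer.load_state_dict(ckpt["optimizer"])
    return ckpt["t"]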
Example No. 13
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    Statistic['parameters'] = {
        'replay_buffer_size': replay_buffer_size,
        'batch_size': batch_size,
        'gamma': gamma,
        'frame_history_len': frame_history_len,
        'learning_starts': learning_starts,
        'learning_freq': learning_freq,
        'target_update_freq': target_update_freq,
        'name': env.env.unwrapped.spec.id
    }
    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Inference only: no need to track history for the forward pass
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    if USE_CUDA:
        Q = Q.cuda()
        target_Q = target_Q.cuda()

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    filename = 'statistics.pkl'

    # Google Drive
    try:
        import google.colab
        IN_COLAB = True
    except ImportError:
        IN_COLAB = False

    if IN_COLAB:
        run_in_colab_message()
        try:
            from google.colab import auth
            import logging
            from pydrive.auth import GoogleAuth
            from pydrive.drive import GoogleDrive
            from oauth2client.client import GoogleCredentials
            logging.getLogger('googleapiclient.discovery_cache').setLevel(
                logging.ERROR)
            auth.authenticate_user()
            gauth = GoogleAuth()
            gauth.credentials = GoogleCredentials.get_application_default()
            drive = GoogleDrive(gauth)
        except Exception:
            pass

    iter_time = time()

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        idx = replay_buffer.store_frame(last_obs)
        enc_obs = replay_buffer.encode_recent_observation()

        if t > learning_starts:
            action = select_epilson_greedy_action(Q, enc_obs, t)
        else:
            action = torch.IntTensor([[random.randrange(num_actions)]])

        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()

        replay_buffer.store_effect(idx, action, reward, done)

        last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            #3.a
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # These batches are data, not parameters, so they don't need gradients
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.)
            act_batch = Variable(torch.from_numpy(act_batch).type(torch.int64))
            rew_batch = Variable(torch.from_numpy(rew_batch).type(dtype))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.)
            done_mask = Variable(torch.from_numpy(done_mask).type(torch.int64))

            if USE_CUDA:
                obs_batch = obs_batch.cuda()
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                next_obs_batch = next_obs_batch.cuda()
                done_mask = done_mask.cuda()

            # Q network
            val = Q(obs_batch).gather(dim=1, index=act_batch.unsqueeze(1))

            # Q target network
            with torch.no_grad():
                tar_val_t = target_Q(next_obs_batch).max(1)[0]
            # Equivalent to the deprecated torch.addcmul(rew, gamma, mask, next_q) form
            tar_val = rew_batch + gamma * (1 - done_mask.type(dtype)) * tar_val_t

            # 3.b error calculate
            d_error = (tar_val - val.squeeze()).clamp_(-1, 1) * -1.
            # d_error = torch.pow((tar_val - val.squeeze()).clamp_(-1, 1), 2) * -1.

            # 3.c train Q network
            optimizer.zero_grad()
            val.backward(d_error.data.unsqueeze(1))
            optimizer.step()

            # 3.d update target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print(f"Iteration time:{time()-iter_time:.2f}")
            iter_time = time()
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            filename = f"{t}" + 'statistics.pkl' if IN_COLAB else 'statistics.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % filename)
            if IN_COLAB and t % (LOG_EVERY_N_STEPS * 10) == 0:
                try:
                    stat_pkl = drive.CreateFile()
                    stat_pkl.SetContentFile(filename)
                    stat_pkl.Upload()
                    print("Uploaded to drive")
                except Exception:
                    print("Exception during upload to drive")