Example #1
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can later be used to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy ndarrays to torch Variables for the computation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values; q_func takes only the state and outputs a value for every action.
            # We select the Q value of the action that was actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))
            # Compute next Q value based on which action gives max Q values
            # Detach the variable from the current graph since we don't want gradients for the next Q values to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 will be the right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data.unsqueeze(1))

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Example #2
def dqn_learing(
    env,
    q_func,
    optimizer_spec,
    exploration,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000,
    num_actions1=31,
    num_actions2=27
    ):

    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    ###############
    # BUILD MODEL #
    ###############

    img_h, img_w, img_c = 32, 120, 1
    input_arg = frame_history_len * img_c

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0)
            # Use volatile = True if variable is only used in inference mode, i.e. don't save the history
            out1, out2 = model(Variable(obs))
            out1 = out1.max(1)[1].data.cpu().numpy()[0]
            out2 = out2.max(1)[1].data.cpu().numpy()[0]
            return out1, out2
        else:
            return random.randrange(num_actions1), random.randrange(num_actions2)

    # Initialize target q function and q function
    Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype)
    target_Q = q_func(num_actions1, num_actions2).cuda(0).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    epoch_reward = []
    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can later be used to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action1, action2 = select_epilson_greedy_action(Q, recent_observations, t)
        else:
            action1, action2 = random.randrange(num_actions1), random.randrange(num_actions2)
        # Advance one step
        obs, reward, done = env.step(action1, action2)
        epoch_reward.append(reward)
        if done:
            env.render()
        # clip rewards between -1 and 1
        # reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action1, action2, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
            print(np.mean(epoch_reward))
            epoch_reward = []
            torch.save(Q,'../../weights/Q' + str(num_actions1) + '.pt')
            torch.save(target_Q,'../../weights/target_Q' + str(num_actions1) + '.pt')
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act1_batch, act2_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the computation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype))
            act1_batch = Variable(torch.from_numpy(act1_batch).long())
            act2_batch = Variable(torch.from_numpy(act2_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype))
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act1_batch = act1_batch.cuda()
                act2_batch = act2_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q values; q_func takes only the state and outputs
            # a value for every state-action pair.
            # We choose Q based on the action taken.
            q1, q2 = Q(obs_batch)
            current_Q1_values = q1.gather(1, act1_batch.unsqueeze(1))
            current_Q2_values = q2.gather(1, act2_batch.unsqueeze(1))
            # Compute next Q value based on which action gives max Q values
            # Detach the variable from the current graph since we don't want gradients for the next Q values to propagate
            tq1, tq2 = target_Q(next_obs_batch)
            next_max_q1 = tq1.detach().max(1)[0]
            next_max_q2 = tq2.detach().max(1)[0]
            next_Q1_values = not_done_mask * next_max_q1
            next_Q2_values = not_done_mask * next_max_q2
            # Compute the target of the current Q values
            target_Q1_values = rew_batch + (gamma * next_Q1_values)
            target_Q2_values = rew_batch + (gamma * next_Q2_values)
            # Compute Bellman error
            bellman_error1 = target_Q1_values.unsqueeze(1) - current_Q1_values
            bellman_error2 = target_Q2_values.unsqueeze(1) - current_Q2_values
            bellman_error = bellman_error1 + bellman_error2
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_error * -1 will be the right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            current_Q_values = current_Q1_values + current_Q2_values
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
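Example #2 above assumes a q_func whose constructor takes (num_actions1, num_actions2) and whose forward pass returns two Q-value heads, one per action dimension, plus a ReplayBuffer whose store_effect/sample handle two actions. A minimal sketch of such a two-head network is shown below; the architecture and layer sizes are purely illustrative, only the constructor signature and the (q1, q2) return shape are taken from the example.

import torch.nn as nn

class TwoHeadQNet(nn.Module):
    """Illustrative two-head Q network matching the (q1, q2) = Q(obs) usage above."""
    def __init__(self, num_actions1, num_actions2, in_channels=4):
        # in_channels=4 corresponds to frame_history_len * img_c in Example #2
        super(TwoHeadQNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=8, stride=4), nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=4, stride=2), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),   # keeps the sketch independent of the input size
        )
        self.head1 = nn.Linear(64, num_actions1)  # Q-values for the first action dimension
        self.head2 = nn.Linear(64, num_actions2)  # Q-values for the second action dimension

    def forward(self, x):
        h = self.features(x).flatten(1)
        return self.head1(h), self.head2(h)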
Example #3
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Wrap the forward pass in torch.no_grad() since it is only used in inference mode, i.e. don't save the history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE
    Q = q_func(input_arg, num_actions)
    Q_target = q_func(input_arg, num_actions)

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        if (t > learning_starts):
            action = select_epilson_greedy_action(Q, encoded_obs, t)
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)
        if (done):
            last_obs = env.reset()
        else:
            last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            #
            # Alpha (the learning rate) from the Q-function update isn't present in our code -- it's in OptimizerSpec in main.
            # Move to GPU if possible
            # done flag in loop   ---- SKIPPED IF DONE IS TRUE
            # clipping the error between -1 and 1   -- OK
            # backward the error meaning?
            # Suggestion for changing parameters - change exploration schedule (main)
            #
            # Q.cuda()
            obs_batch, act_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size=batch_size)
            states = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            actions = Variable(torch.from_numpy(act_batch).long())
            rewards = Variable(torch.from_numpy(reward_batch).float())
            next_states = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_dones = Variable(torch.from_numpy(1 - done_mask).type(dtype))
            if USE_CUDA:
                states = states.cuda()
                actions = actions.cuda()
                rewards = rewards.cuda()
                next_states = next_states.cuda()
            Q.train()
            Q_target.eval()
            predicted_rewards = Q(states).gather(1,
                                                 actions.unsqueeze(1))  #Q(s,a)
            next_max_Q = Q_target(next_states).detach().max(1)[
                0]  #.unsqueeze(1) #Q_target(s,a)
            next_Q_values = not_dones * next_max_Q
            target_Q_values = rewards + (gamma * next_Q_values)  #r + Q_target
            bellman_error = target_Q_values - predicted_rewards.squeeze(1)
            clipped_bellman_error = bellman_error.clamp(-1, 1) * (-1.0)
            optimizer.zero_grad()
            predicted_rewards.backward(clipped_bellman_error.data.unsqueeze(1))
            optimizer.step()
            num_param_updates += 1
            if (num_param_updates % target_update_freq == 0):
                Q_target.load_state_dict(Q.state_dict())

            # for obs,act,reward,next_obs,done in zip(obs_batch,act_batch,reward_batch,next_obs_batch,done_mask):
            #     if(done == 1.0):
            #         continue
            #     obs = Variable(torch.from_numpy(obs, ).type(dtype).unsqueeze(0) / 255.0, requires_grad=True)
            #     next_obs = Variable(torch.from_numpy(next_obs).type(dtype).unsqueeze(0) / 255.0, requires_grad=False)
            #     current_Q = Q(obs)
            #     predicted_reward = Variable(current_Q[0][act].unsqueeze(0), requires_grad=True)
            #     target_reward = Q_target(next_obs).data.max(1)[0]
            #     loss = loss_fn(reward + gamma * target_reward, predicted_reward).clamp(-1, 1) * (-1.0)

            #     optimizer.zero_grad()
            #     # should be current.backward(d_error.data.unsqueeze(1))
            #     # but it crashes on misfitting dims
            #     predicted_reward.backward(loss.data.unsqueeze(1))

            #     optimizer.step()
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Example #4
def dqn_learing(
    env,
    q_func,
    checkpoint_path,
    optimizer_spec,
    exploration,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000
    ):

    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space)      == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if the variable is only used in inference mode, i.e. don't save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].view(1,1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # optionally resume from a checkpoint
    if checkpoint_path:
        if os.path.isfile(checkpoint_path):
            print("=> loading checkpoint '{}'".format(checkpoint_path))
            checkpoint = torch.load(checkpoint_path)
            Q.load_state_dict(checkpoint['model_state_dict'])
            target_Q.load_state_dict(checkpoint['target_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}')".format(checkpoint_path))
        else:
            print("=> no checkpoint found at '{}'".format(checkpoint_path))



    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_EVERY_N_STEPS = 1000
    episode_reward = 0
    episode_rewards = []

    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can later be used to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0][0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        print("reward: %f" % reward)
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            episode_reward = 0
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)
            # Convert numpy ndarrays to torch Variables for the computation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values; q_func takes only the state and outputs a value for every action.
            # We select the Q value of the action that was actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze() # squeeze the [batch_size x 1] Tensor to have a shape of batch_size
            # Compute next Q value based on which action gives max Q values
            # Detach the variable from the current graph since we don't want gradients for the next Q values to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)

#            # Compute Bellman error
#            bellman_error = target_Q_values - current_Q_values
#            # clip the bellman error between [-1 , 1]
#            clipped_bellman_error = bellman_error.clamp(-1, 1)
#            # Note: clipped_bellman_delta * -1 will be right gradient
#            d_error = clipped_bellman_error * -1.0

            # Compute the Huber loss. Why not MSE? Because the Huber loss is more robust to noisy Q estimates than plain MSE.
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
#            current_Q_values.backward(d_error.data.unsqueeze(1))

            loss.backward()
            # Clip the gradients to lie between -1 and +1
            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)
            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_reward += reward
#        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        episode_rewards.append(episode_reward)
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')

        ### 5. Save a checkpoint
        if t % SAVE_EVERY_N_STEPS == 0 and t > learning_starts:
            save_checkpoint({
                'epoch': t + 1,
                'model_state_dict': Q.state_dict(),
                'target_state_dict': target_Q.state_dict(),
                'optimizer' : optimizer.state_dict(),
            }, "checkpoints/checkpoint.%d.tar" % t)
Example #5
def dqn_learning(
        env,
        method,
        game,
        q_func,
        optimizer_spec,
        exploration,
        stopping_criterion=None,
        replay_buffer_size=1000000,
        batch_size=32,
        gamma=0.99,
        learning_starts=50000,
        learning_freq=4,
        frame_history_len=4,
        target_update_freq=10000,
        double=False,
        dueling=False,
        logdir=None,
        svrl=False,
        me_type=None,
        maskp=None,
        maskstep=None,
        maskscheduler=True
    ):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    def select_epsilon_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].view(1, 1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    #   RUN ENV   #
    ###############

    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    SAVE_MODEL_EVERY_N_STEPS = 1000000
    mask_scheduler_step = (1 - maskp) / maskstep

    for t in count():
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ################
        # STEP THE ENV #
        ################

        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        if t > learning_starts:
            action = select_epsilon_greedy_action(Q, recent_observations, t)[0][0]
        else:
            action = random.randrange(num_actions)

        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)

        if done:
            obs = env.reset()
        last_obs = obs

        ################
        #   TRAINING   #
        ################

        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):

            # mask scheduler
            if maskscheduler:
                maskp = min(maskp + mask_scheduler_step, 1)

            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)

            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1)).squeeze()
            target_q_mat = target_Q(next_obs_batch).detach()

            # SV-RL scheme
            if svrl:
                target_q_mat = globals()[me_type](target_q_mat, target_q_mat.size(0), target_q_mat.size(1), maskp)

            if not double:
                next_max_q = target_q_mat.max(1)[0]
            else:
                q_temp = Q(next_obs_batch).detach()
                act_temp = np.argmax(q_temp.cpu(), axis=1)
                next_max_q = torch.sum(torch.from_numpy(np.eye(num_actions)[act_temp]).type(dtype) * target_q_mat.type(dtype), dim=1)

            next_Q_values = not_done_mask * next_max_q.type(dtype)
            target_Q_values = rew_batch + (gamma * next_Q_values)

            loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

            optimizer.zero_grad()
            loss.backward()

            for params in Q.parameters():
                params.grad.data.clamp_(-1, 1)

            optimizer.step()
            num_param_updates += 1

            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ################
        # LOG PROGRESS #
        ################

        # save model
        if t % SAVE_MODEL_EVERY_N_STEPS == 0:
            if not os.path.exists("models"):
                os.makedirs("models")
            add_str = 'single'
            if double:
                add_str = 'double'
            if dueling:
                add_str = 'dueling'
            model_save_path = 'models/%s_%s_%s.ckpt' % (str(game[:-14]), add_str, method)
            torch.save(Q.state_dict(), model_save_path)

        # log progress
        episode_rewards = get_wrapper_by_name(env, "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:

            logz.log_tabular('Timestep', t)
            logz.log_tabular('MeanReward100Episodes', mean_episode_reward)
            logz.log_tabular('BestMeanReward', best_mean_episode_reward)
            logz.log_tabular('Episodes', len(episode_rewards))
            logz.log_tabular('Exploration', exploration.value(t))
            logz.dump_tabular()

            sys.stdout.flush()
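Example #5 builds its Double DQN target by one-hot encoding the argmax actions with np.eye and summing against the target network's Q matrix. A common equivalent formulation stays entirely in torch and uses gather; the self-contained sketch below illustrates that alternative with toy tensors and cross-checks it against the one-hot form, it is not a drop-in patch of the code above.

import numpy as np
import torch

# Sketch of gather-based Double DQN target selection. B = batch size, A = number of actions.
B, A = 32, 6
q_temp = torch.randn(B, A)         # stands in for Q(next_obs_batch).detach()
target_q_mat = torch.randn(B, A)   # stands in for target_Q(next_obs_batch).detach()

best_actions = q_temp.argmax(dim=1, keepdim=True)              # online net picks the actions, [B, 1]
next_max_q = target_q_mat.gather(1, best_actions).squeeze(1)   # target net evaluates them, [B]

# Cross-check against the one-hot formulation used in Example #5.
one_hot = torch.from_numpy(np.eye(A)[best_actions.squeeze(1).numpy()]).float()
assert torch.allclose(next_max_q, (one_hot * target_q_mat).sum(dim=1))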
Example #6
def dqn_learing(
    env,
    q_func,
    optimizer_spec,
    exploration,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000
    ):

    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.size

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return torch.IntTensor([[model(Variable(obs)).data.max(1)[1].cpu()]])
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])


    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)
    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    writer = SummaryWriter()

    for t in count():

        ### Step the env and store the transition
        # Store the latest observation in replay memory; last_idx can later be used to store the action, reward, and done flag
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()
        
        # Choose a random action until learning starts
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0, 0]
        else:
            action = random.randrange(num_actions)

        # Advance one step
        obs, reward, done = env.step(action)

        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()

        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and
                t % learning_freq == 0 and
                replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(batch_size)

            # Convert numpy ndarrays to torch Variables for the computation
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute the current Q values; q_func takes only the state and outputs a value for every action.
            # We select the Q value of the action that was actually taken.
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))

            # Compute next Q value based on which action gives max Q values
            # Detach the variable from the current graph since we don't want gradients for the next Q values to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            loss = F.smooth_l1_loss(current_Q_values, target_Q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()
            

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically copy the Q network weights into the target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())


        ### 4. Log progress and keep track of statistics
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward)

        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1], len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward, len(episode_rewards))
            if len(episode_rewards) > 100:
                writer.add_scalar('data/DQN/best_mean_score', best_mean_episode_reward, len(episode_rewards))

        # Save the log
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t,))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')

    
    writer.close()
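Example #6 saves the whole module with torch.save(Q, 'DQN_net1029.pt'), which pickles the class itself and is brittle across code changes. A sketch of the more portable state_dict pattern is shown below; Q, q_func, input_arg, num_actions and the file name come from the example, the rest is illustrative.

import torch

# Saving: persist only the parameters (more robust than pickling the module).
torch.save(Q.state_dict(), 'DQN_net1029.pt')

# Loading for evaluation: rebuild the network first, then restore the weights.
Q_eval = q_func(input_arg, num_actions)
Q_eval.load_state_dict(torch.load('DQN_net1029.pt', map_location='cpu'))
Q_eval.eval()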
Example #7
    obs, reward, done, _ = env.step(action)
    # clip rewards between -1 and 1
    reward = max(-1.0, min(reward, 1.0))
    # Store other info in replay memory
    replay_buffer.store_effect(last_idx, action, reward, done)
    # Resets the environment when reaching an episode boundary.
    if done:
        obs = env.reset()
    last_obs = obs

    ### Perform experience replay and train the network.
    # Note that this is only done if the replay buffer contains enough samples
    # for us to learn something useful -- until then, the model will not be
    # initialized and random actions should be taken
    if (t > learning_starts and t % learning_freq == 0
            and replay_buffer.can_sample(batch_size)):
        # Use the replay buffer to sample a batch of transitions
        # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
        # in which case there is no Q-value at the next state; at the end of an
        # episode, only the current state reward contributes to the target
        obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
            batch_size)
        # Convert numpy ndarrays to torch Variables for the computation
        obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) / 255.0)
        act_batch = Variable(torch.from_numpy(act_batch).long())
        rew_batch = Variable(torch.from_numpy(rew_batch))
        next_obs_batch = Variable(
            torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

        if USE_CUDA:
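The fragment above stops mid-block, and like most examples on this page it relies on the pre-0.4 Variable API and a module-level dtype/USE_CUDA convention. Purely as a side note, the self-contained sketch below shows how the same batch conversion reads with modern PyTorch tensors; the helper name and device handling are illustrative assumptions.

import torch

# Modern equivalent of the Variable-based batch conversion above (sketch).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def to_batch(obs_batch, act_batch, rew_batch, next_obs_batch, done_mask):
    # numpy arrays straight from ReplayBuffer.sample(); scale pixels to [0, 1]
    obs = torch.as_tensor(obs_batch, dtype=torch.float32, device=device) / 255.0
    acts = torch.as_tensor(act_batch, dtype=torch.int64, device=device)
    rews = torch.as_tensor(rew_batch, dtype=torch.float32, device=device)
    next_obs = torch.as_tensor(next_obs_batch, dtype=torch.float32, device=device) / 255.0
    not_done = 1.0 - torch.as_tensor(done_mask, dtype=torch.float32, device=device)
    return obs, acts, rews, next_obs, not_done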
Example #8
def dqn_learing(
        #env,
        q_func,
        optimizer_spec,
        exploration,
        #stopping_criterion=None,
        replay_buffer_size=1000,
        batch_size=32,
        gamma=0.99,
        learning_starts=1,
        learning_freq=4,
        frame_history_len=1,
        target_update_freq=10000):

    #our own code
    read_image()
    rgb_data = depth_data.reshape(640, 480, 1)
    input_arg = rgb_data
    #input for the algorithm
    num_actions = 5
    last_obs = rgb_data

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0

            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(1, num_actions).type(dtype)
    target_Q = q_func(1, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(1000, 1)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    for t in count():

        last_idx = replay_buffer.store_frame(last_obs)

        recent_observations = replay_buffer.encode_recent_observation()

        # Choose random action if not yet start learning
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)
        # Advance one step

        control_robot(action + 1)

        rgb_data = depth_data.reshape(640, 480, 1)
        obs = rgb_data
        ##evaluate the action
        dis_data = np.array(depth_data)
        dis_data[np.isnan(dis_data)] = 999999999999
        dis_data[dis_data == 0] = 999999999999
        dis = np.min(dis_data)
        print("MIN DISTANCE:" + str(dis) + "-------------")
        reward = 0
        if dis < 500:
            reward = 1
        else:
            reward = -1
        print("REWARD:" + str(reward) + "--------------")
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, False)
        # Resets the environment when reaching an episode boundary.
        #if done:
        #obs = env.reset()
        last_obs = obs

        if (t > 1 and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            print("Training")
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)

            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            current_Q_values = Q(obs_batch).gather(
                1, act_batch.unsqueeze(1)).squeeze()
            # Compute next Q value based on which action gives max Q values
        # Detach variable from the current graph since we don't want gradients for next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            print("next:", next_Q_values.shape)
            print("current:", current_Q_values.squeeze().shape)
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error  #.clamp(-1, 1)
            #print(clipped_bellman_error)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            #print(d_error.data)
            optimizer.zero_grad()
            # run backward pass
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by Q network to target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
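
# The reward shaping above, pulled out as a standalone helper for clarity. This is
# only a sketch mirroring the snippet's logic (invalid NaN/zero depth readings are
# masked out, then the minimum remaining distance is thresholded at 500); the
# threshold, the sign convention, and the helper name `depth_reward` are otherwise
# assumptions.
import numpy as np


def depth_reward(depth_data, threshold=500):
    dis_data = np.asarray(depth_data, dtype=np.float64).copy()
    dis_data[np.isnan(dis_data)] = np.inf  # ignore missing readings
    dis_data[dis_data == 0] = np.inf       # ignore zero (invalid) readings
    return 1 if np.min(dis_data) < threshold else -1
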
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own conv-net using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        print(env.observation_space.shape)
        img_h, img_w, img_c = env.observation_space.shape
        # input_arg = frame_history_len * img_c
        input_arg = frame_history_len * 1
    num_actions = env.action_space.n
    print(env.action_space)
    print(f"({input_arg}): ({img_h}X{img_w}X{img_c})")

    # Construct an epilson greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            with torch.no_grad():
                values = model(Variable(obs))
            return values.data.max(1)[1].cpu().unsqueeze(dim=1)
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    obs = cv.cvtColor(last_obs, cv.COLOR_BGR2GRAY)
    obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2))
    last_obs = obs[..., np.newaxis]
    print("Q model:")
    summary(Q, input_size=(input_arg, last_obs.shape[0], last_obs.shape[1]))
    print("Q-TARGET model:")
    summary(target_Q,
            input_size=(input_arg, last_obs.shape[0], last_obs.shape[1]))
    LOG_EVERY_N_STEPS = 10000

    rewards = 0.
    out_count = 0
    for t in count():
        ### Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break
        if t % 1e3 == 0:
            if out_count == 0:
                stdout.write("|")
                out_count += 1
            elif out_count % 10 == 0:
                stdout.write(f"{out_count}|")
                out_count += 1
            elif out_count >= 50:
                stdout.write("=> \n")
                out_count = 0
            else:
                stdout.write(".")
                out_count += 1
            stdout.flush()
        ### Step the env and store the transition
        # Store latest observation in replay memory; last_idx can be used to store action, reward, done
        last_idx = replay_buffer.store_frame(last_obs)
        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose random action if not yet start learning
        if t > learning_starts:
            values = select_epilson_greedy_action(Q, recent_observations, t)
            action = values[0, 0]
        else:
            action = random.randrange(num_actions)
        # Advance one step
        obs, reward, done, _ = env.step(action)
        rewards += reward
        # clip rewards between -1 and 1
        reward = max(-1.0, min(reward, 1.0))
        # Store other info in replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)
        # Resets the environment when reaching an episode boundary.
        if done:
            obs = env.reset()
            print(len(episode_rewards), episode_rewards, rewards)
            rewards = 0.
        # print(obs.shape)
        # cv.imshow('now_color', obs)
        # cv.waitKey(1)
        obs = cv.cvtColor(obs, cv.COLOR_BGR2GRAY)
        obs = cv.resize(obs, dsize=(obs.shape[1] // 2, obs.shape[0] // 2))
        obs = obs[..., np.newaxis]
        # cv.imshow('now', obs)
        # cv.waitKey(1)
        last_obs = obs
        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            # Convert numpy nd_array to torch variables for calculation
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)
            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            values = Q(obs_batch)
            current_Q_values = values.gather(1,
                                             act_batch.unsqueeze(1)).squeeze()
            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = rew_batch + (gamma * next_Q_values)
            # Compute Bellman error
            bellman_error = target_Q_values - current_Q_values
            # clip the bellman error between [-1 , 1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            # Note: clipped_bellman_delta * -1 will be right gradient
            d_error = clipped_bellman_error * -1.0
            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            # current_Q_values.backward(d_error.data.unsqueeze(1))
            current_Q_values.backward(d_error.data)

            # Perform the update
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by Q network to target Q network
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Exemple #10
0
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000,
                statistics_file_name="statistics.pkl"):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network

    statistics_file_name: str
        Where to store the statistics file
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    print("STATISTICS_FILE_NAME: {}".format(statistics_file_name))

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epilson greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(
                torch_types.FloatTensor).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return random.randrange(num_actions)

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE
    policy_net = q_func(input_arg, num_actions).to(device).type(
        torch_types.FloatTensor)  # Q
    target_net = q_func(input_arg, num_actions).to(device).type(
        torch_types.FloatTensor)  # Q target
    target_net.load_state_dict(
        policy_net.state_dict())  # copies the state of policy Q into target

    ######

    # Construct policy_net network optimizer function
    optimizer = optimizer_spec.constructor(policy_net.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        # YOUR CODE HERE
        stored_frame_idx = replay_buffer.store_frame(last_obs)
        last_obs_encoded = replay_buffer.encode_recent_observation()
        action = select_epilson_greedy_action(policy_net, last_obs_encoded, t)

        obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(stored_frame_idx, action, reward, done)

        if done:
            obs = env.reset()

        last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            sample = replay_buffer.sample(batch_size)
            obs_batch, actions_batch, rewards_batch, next_obs_batch, done_mask = sample

            # convert batches to pytorch tensors:
            obs_batch = torch.from_numpy(obs_batch).to(device).type(
                torch_types.FloatTensor) / 255.0
            next_obs_batch = torch.from_numpy(next_obs_batch).to(device).type(
                torch_types.FloatTensor) / 255.0
            actions_batch = torch.from_numpy(actions_batch).to(device).type(
                torch_types.LongTensor)
            rewards_batch = torch.from_numpy(rewards_batch).to(device).type(
                torch_types.FloatTensor)
            non_final_mask = 1 - torch.from_numpy(done_mask).to(device).type(
                torch_types.FloatTensor)

            # inspired by https://pytorch.org/tutorials/intermediate/reinforcement_q_learning.html:

            # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
            # columns of actions taken
            state_action_values = policy_net(obs_batch).gather(
                1, actions_batch.unsqueeze(1)).squeeze(1)

            # Compute V(s_{t+1}) for all next states.
            next_state_values = target_net(next_obs_batch).max(
                1)[0].detach() * non_final_mask
            # Compute the expected Q values
            expected_state_action_values = (next_state_values *
                                            gamma) + rewards_batch

            # Compute loss
            d_error = state_action_values - expected_state_action_values  # = -bellman_error
            d_error.clamp_(-1, 1)

            # Optimize the model
            optimizer.zero_grad()
            state_action_values.backward(d_error)
            optimizer.step()

            num_param_updates += 1
            # Periodically update target network:
            if num_param_updates % target_update_freq == 0:
                target_net.load_state_dict(policy_net.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t >= learning_starts:
            print("Timestep %d" % (t, ))
            print("  mean reward (100 episodes) %f" % mean_episode_reward)
            print("  best mean reward %f" % best_mean_episode_reward)
            print("  episodes %d" % len(episode_rewards))
            print("  exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open(statistics_file_name, 'wb') as f:
                pickle.dump(Statistic, f)
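
# The snippet above appends to a module-level `Statistic` dict and pickles it; the
# dict itself is defined elsewhere in the original project. A plausible minimal
# definition is sketched below as an assumption (only the two key names are taken
# from the code above), along with how the dumped file could be read back.
import pickle

Statistic = {
    "mean_episode_rewards": [],
    "best_mean_episode_rewards": [],
}

# Reading the dumped statistics back, e.g. for plotting:
# with open("statistics.pkl", "rb") as f:
#     stats = pickle.load(f)
# print(len(stats["mean_episode_rewards"]))
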
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    if not os.path.isdir("./models"):
        os.mkdir("./models")

    if len(env.observation_space.shape) == 1:
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epilson greedy policy
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                ret = model(obs).data.max(1)[1].cpu()
                return ret
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    save_best_mean_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 20000
    SAVE_EVERY_N_STEPS = 2000000
    AL_ALPHA = 0.7

    for t in count():
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### Step the env and store the transition
        last_idx = replay_buffer.store_frame(last_obs)
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose random action if not yet start learning
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0]
        else:
            action = random.randrange(num_actions)
        obs, reward, done, _ = env.step(action)
        reward = max(-1.0, min(reward, 1.0))
        replay_buffer.store_effect(last_idx, action, reward, done)
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            cur_all_Q_values = Q(obs_batch)
            action_gap = cur_all_Q_values.max(
                dim=1)[0] * cur_all_Q_values.size(1) - cur_all_Q_values.sum(
                    dim=1)
            Statistic["mean_action_gap"].append(action_gap.mean().item())

            current_Q_values = cur_all_Q_values.gather(
                1, act_batch.unsqueeze(1)).squeeze()
            next_target_Q_values = target_Q(next_obs_batch).detach()
            next_max_q = next_target_Q_values.max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            target_Q_values = rew_batch + (gamma * next_Q_values)
            bellman_error = target_Q_values - current_Q_values

            cur_target_Q_values = target_Q(obs_batch).detach()

            cur_advantage = cur_target_Q_values.max(
                dim=1)[0] - cur_target_Q_values.gather(
                    1, act_batch.unsqueeze(1)).squeeze()
            next_advantage = next_target_Q_values.max(
                dim=1)[0] - next_target_Q_values.gather(
                    1, act_batch.unsqueeze(1)).squeeze()

            # Set up the error according to the operator you want
            al_error = bellman_error - AL_ALPHA * cur_advantage
            persistent_error = bellman_error - AL_ALPHA * next_advantage
            pal_error = torch.max(al_error, persistent_error)
            error = pal_error  # use whichever you want

            clipped_bellman_error = error.clamp(-1, 1)
            d_error = clipped_bellman_error * -1.0
            optimizer.zero_grad()
            current_Q_values.backward(d_error.data)

            optimizer.step()
            num_param_updates += 1

            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        ## Log Progress
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % './models/statistics.pkl')

            if save_best_mean_reward < best_mean_episode_reward:
                save_best_mean_reward = best_mean_episode_reward
                torch.save(Q.state_dict(), './models/best_model.pth')

        if t % SAVE_EVERY_N_STEPS == 0:
            torch.save(Q.state_dict(), './models/n_steps_%d.pth' % t)
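
# The operator selection above implements Advantage Learning (AL) and Persistent
# Advantage Learning (PAL) on top of the usual TD error. With
# delta = r + gamma * max_a' Q_target(s', a') - Q(s, a_taken), the snippet computes
#   AL error:  delta - alpha * (max_a Q_target(s, a)  - Q_target(s,  a_taken))
#   PAL error: max(AL error, delta - alpha * (max_a Q_target(s', a) - Q_target(s', a_taken)))
# A small sketch of the same computation on plain tensors; the shapes and the
# function name `pal_error` are assumptions, only the formulas mirror the code above.
import torch


def pal_error(delta, q_tgt_s, q_tgt_next, actions, alpha=0.7):
    # delta: (B,) TD errors; q_tgt_s / q_tgt_next: (B, A) target-network Q values
    cur_adv = q_tgt_s.max(dim=1)[0] - q_tgt_s.gather(1, actions.unsqueeze(1)).squeeze(1)
    next_adv = q_tgt_next.max(dim=1)[0] - q_tgt_next.gather(1, actions.unsqueeze(1)).squeeze(1)
    al = delta - alpha * cur_adv
    persistent = delta - alpha * next_adv
    return torch.max(al, persistent)  # element-wise max, as in the snippet above
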
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    ###############
    # BUILD MODEL #
    ###############

    # Set input_arg for the Q function according to the observation shape
    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c

    # Get the number of actions from the simulator
    num_actions = env.action_space.size

    # Construct an epilson greedy policy with given exploration schedule
    # Define the epsilon-greedy policy
    ## Compare a randomly drawn sample with the exploration schedule and return either the greedy or a random action accordingly
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        #
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return torch.IntTensor(
                [[model(Variable(obs)).data.max(1)[1].cpu()]])
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function
    # Create the Q function and the target Q function
    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    # Construct Q network optimizer function
    # Build the optimizer function from optimizer_spec
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    # Create replay_buffer using ReplayBuffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    # Initialize bookkeeping variables
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000

    # TensorboardX writer for monitoring
    writer = SummaryWriter()

    # t starts at 0 and increments every loop iteration, so it counts how many iterations have run
    for t in count():
        ### Step the env and store the transition
        # Store latest observation in replay memory; last_idx can be used to store action, reward, done

        # The most recent observation image is stored in replay_buffer, and last_idx indexes where its action, reward and done flag will go
        last_idx = replay_buffer.store_frame(last_obs)

        # encode_recent_observation will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.

        # Take the most recent frame from replay_buffer and stack it with the preceding frames to build the Q network input
        recent_observations = replay_buffer.encode_recent_observation()

        # Choose random action if not yet start learning
        # If t is greater than learning_starts, i.e. enough iterations have passed, choose the action from the learned policy instead of at random
        if t > learning_starts:
            action = select_epilson_greedy_action(Q, recent_observations, t)[0,
                                                                             0]
        else:
            action = random.randrange(num_actions)

        # Advance one step
        # Take the action, record the resulting observation (obs), reward and done flag, and store them in replay_buffer
        obs, reward, done = env.step(action)
        replay_buffer.store_effect(last_idx, action, reward, done)

        # Resets the environment when reaching an episode boundary.
        # If the episode is over, reset the environment as well
        if done:
            obs = env.reset()
        last_obs = obs

        ### Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken

        ## Learning runs when t is greater than learning_starts,
        ## t falls on the learning_freq period,
        ## and the buffer holds enough samples relative to the batch size
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            # Use the replay buffer to sample a batch of transitions
            # Note: done_mask[i] is 1 if the next state corresponds to the end of an episode,
            # in which case there is no Q-value at the next state; at the end of an
            # episode, only the current state reward contributes to the target

            # Sample a batch of transitions from replay_buffer
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)

            # Convert numpy nd_array to torch variables for calculation

            # Convert the numpy arrays to torch tensors to match the model input
            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype) / 255.0)
            act_batch = Variable(torch.from_numpy(act_batch).long())
            rew_batch = Variable(torch.from_numpy(rew_batch))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.0)
            not_done_mask = Variable(torch.from_numpy(1 -
                                                      done_mask)).type(dtype)

            if USE_CUDA:
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()

            # Compute current Q value, q_func takes only state and output value for every state-action pair
            # We choose Q based on action taken.
            # Compute the current Q values
            current_Q_values = Q(obs_batch).gather(1, act_batch.unsqueeze(1))

            # Compute next Q value based on which action gives max Q values
            # Detach variable from the current graph since we don't want gradients for next Q to propagate
            # Set the next Q values from whichever action gives the max Q value
            next_max_q = target_Q(next_obs_batch).detach().max(1)[0]
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            # Compute the target Q values and optimize via a backward pass
            target_Q_values = rew_batch + (gamma * next_Q_values)
            loss = F.smooth_l1_loss(current_Q_values,
                                    target_Q_values.unsqueeze(1))
            optimizer.zero_grad()
            loss.backward()

            # Perform the update
            # After the update, increment the update counter
            optimizer.step()
            num_param_updates += 1

            # Periodically update the target network by Q network to target Q network
            # Update the target network every target_update_freq parameter updates
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())

        # ### 4. Log progress and keep track of statistics
        # Track episode rewards; once more than 100 episodes have run, also track the best mean reward
        episode_rewards = env.get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        # Log to Tensorboard
        if len(episode_rewards) > 0:
            writer.add_scalar('data/DQN/score', episode_rewards[-1],
                              len(episode_rewards))
            writer.add_scalar('data/DQN/mean_score', mean_episode_reward,
                              len(episode_rewards))
            if len(episode_rewards) > 100:
                writer.add_scalar('data/DQN/best_mean_score',
                                  best_mean_episode_reward,
                                  len(episode_rewards))

        # Print learning progress
        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()
            torch.save(Q, 'DQN_net1029.pt')
            ## Save to an appropriate file

    writer.close()
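
# The snippet above trains with F.smooth_l1_loss, while several other snippets in
# this listing backpropagate a manually clipped TD error via Q.backward(d_error).
# With the default beta of 1 and reduction='sum', the two routes give identical
# gradients (the 'mean' reduction used above only rescales them by the batch size).
# A quick, self-contained check, assuming PyTorch >= 1.0; all tensors are made up:
import torch
import torch.nn.functional as F

current = torch.randn(8, requires_grad=True)
target = torch.randn(8)

# manual route: backward the clipped negative TD error, as in the earlier snippets
d_error = ((target - current).clamp(-1, 1) * -1.0).detach()
current.backward(d_error)
grad_manual = current.grad.clone()

# Huber route
current.grad.zero_()
F.smooth_l1_loss(current, target, reduction='sum').backward()
grad_huber = current.grad.clone()

print(torch.allclose(grad_manual, grad_huber))  # expected: True
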
Exemple #13
0
def dqn_learn(env, q_func, optimizer_spec, exploration, stopping_criterion,
              replay_buffer_size, batch_size, gamma, learning_starts,
              learning_freq, frame_history_len, target_update_freq,
              grad_norm_clipping, double_q):
    """Implements DQN training
    
    Parameters
    ----------
    env : gym.Env
        OpenAI gym environment
    q_func : torch.nn.Module
        DQN that computes q-values for each action: (state) -> (q-value, action)
    optimizer_spec : OptimizerSpec
        parameters for the optimizer
    exploration : Schedule
        schedule for epsilon-greedy exploration
    stopping_criterion : func
        when to stop training: (env, num_timesteps) -> bool
    replay_buffer_size : int
        experience replay memory size
    batch_size : int
        batch size to sample from replay memory
    gamma : float
        discount factor
    learning_starts : int
        number of environment steps before starting the training process
    learning_freq : int
        number of environment steps between updating DQN weights
    frame_history_len : int
        number of previous frames to include as DQN input
    target_update_freq : int
        number of experience replay steps to update the target network
    grad_norm_clipping : float
        maximum size of gradients to clip to
    double_q : bool
        enable double DQN learning
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    def select_action(dqn, obs, t):
        """Implements epsilon-greedy exploration
        
        Parameters
        ----------
        dqn : torch.nn.Module
            DQN model
        obs : np.ndarray
            Stacked input frames to evaluate
        t : int
            Current time step
        
        Returns
        -------
        torch.Tensor, shape (1,)
            action to take
        """
        threshold = exploration.value(t)
        if random.random() > threshold:
            # take optimal action
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # DQN returns (q-value, action)
            q_values = dqn(obs)
            # returns (max, argmax) of q-values (max q-value, action which produces max q-value)
            _, action = q_values.data.max(1)
        else:
            # take a random action
            action = torch.IntTensor([random.randrange(num_actions)])
        return action

    # get input sizes and num actions
    img_h, img_w, img_c = env.observation_space.shape
    in_channels = frame_history_len * img_c
    input_shape = (img_h, img_w, in_channels)
    num_actions = env.action_space.n

    # construct online and target DQNs
    online_DQN = q_func(in_channels=in_channels, num_actions=num_actions)
    target_DQN = q_func(in_channels=in_channels, num_actions=num_actions)

    # construct optimizer
    optimizer = optimizer_spec.constructor(online_DQN.parameters(),
                                           **optimizer_spec.kwargs)

    # construct replay memory
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    # initialize main loop variables
    num_param_updates = 0
    avg_episode_reward = float('-inf')
    best_avg_episode_reward = float('-inf')
    cumulative_avg_episode_reward = float('-inf')
    prev_obs = env.reset()

    # main training loop
    for t in count():
        # check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env, t):
            break

        # store transition and concatenate last frames
        last_idx = replay_buffer.store_frame(prev_obs)

        # stack previous frames into a tensor to give to DQN
        stacked_obs = replay_buffer.encode_recent_observation()

        # take random actions until we've officially started training
        if t > learning_starts:
            # select action according to epsilon-greedy
            action = select_action(online_DQN, stacked_obs, t)[0]
        else:
            # take a random action
            action = random.randrange(num_actions)

        # step environment
        obs, reward, done, _ = env.step(action)
        # clip reward
        reward = max(-1.0, min(reward, 1.0))
        # store effect of taking action in prev_obs into replay memory
        replay_buffer.store_effect(last_idx, action, reward, done)

        # if game is finished, reset environment
        if done:
            obs = env.reset()
        prev_obs = obs

        # experience replay
        if t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(
                batch_size):

            # sample batches
            obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            obs_batch = torch.from_numpy(obs_batch).type(dtype) / 255.0
            action_batch = torch.from_numpy(action_batch).long()
            reward_batch = torch.from_numpy(reward_batch)
            next_obs_batch = torch.from_numpy(next_obs_batch).type(
                dtype) / 255.0
            not_done_mask = torch.from_numpy(1 - done_mask).type(dtype)

            if torch.cuda.is_available():
                action_batch = action_batch.cuda()
                reward_batch = reward_batch.cuda()

            # Compute current q-values: Q(s, a)
            # Select q-values based on actions we would have taken for each state
            # shape: (BATCH_SIZE, 1)
            current_q_values = online_DQN(obs_batch).gather(
                1, action_batch.unsqueeze(1))

            # double DQN or vanilla DQN
            if double_q:
                # compute which actions to take according to online network: argmax_a Q(s', a)
                greedy_actions = online_DQN(next_obs_batch).detach().max(1)[1]
                # compute q-values of those actions using target network: Q_hat(s', argmax_a Q(s', a))
                next_q_values = target_DQN(next_obs_batch).gather(
                    1, greedy_actions.unsqueeze(1))
            else:
                # Compute next q-values using target network
                next_q_values = target_DQN(next_obs_batch).detach().max(1)[0]
                next_q_values = next_q_values.unsqueeze(1)

            # apply mask to retain q-values
            next_q_values = not_done_mask.unsqueeze(1) * next_q_values
            """
            Compute the target q-values (BATCH_SIZE, 1)
            y_j = r_j + gamma * max_a' Q(s', a')                for vanilla DQN
            y_j = r_j + gamma * Q_hat(s', argmax_a Q(s', a))    for double DQN
            """
            target_q_values = reward_batch + (gamma * next_q_values)
            """
            Use the huber loss instead of clipping the TD error.
            Huber loss intuitively means we assign a much larger loss where the error is large (quadratic)
            Smaller errors equate to smaller losses (linear)
            """
            loss = F.smooth_l1_loss(current_q_values, target_q_values)

            # Clear previous gradients before backward pass
            optimizer.zero_grad()
            # run backward pass
            loss.backward()

            # clip gradients
            nn.utils.clip_grad_norm_(online_DQN.parameters(),
                                     grad_norm_clipping)

            # update weights of dqn
            optimizer.step()
            num_param_updates += 1

            # update target network weights
            if num_param_updates % target_update_freq == 0:
                target_DQN.load_state_dict(online_DQN.state_dict())

        # end experience replay

        # log progress so far by averaging last 100 episodes
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            avg_episode_reward = np.mean(episode_rewards[-100:])
            cumulative_avg_episode_reward = np.mean(episode_rewards)
        if len(episode_rewards) > 100:
            best_avg_episode_reward = max(best_avg_episode_reward,
                                          avg_episode_reward)

        if t % LOG_FREQ == 0 and t > learning_starts:
            print('-' * 64)
            print('Timestep {}'.format(t))
            print(
                'Average reward (100 episodes): {}'.format(avg_episode_reward))
            print('Best average reward: {}'.format(best_avg_episode_reward))
            print('Cumulative average reward: {}'.format(
                cumulative_avg_episode_reward))
            print('Episode {}'.format(len(episode_rewards)))
            print('Exploration {}'.format(exploration.value(t)))
            print('\n')
            sys.stdout.flush()
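
# A toy, self-contained illustration of the two bootstrap targets computed above:
# vanilla DQN uses max_a' Q_hat(s', a'), while double DQN picks the argmax with the
# online network and evaluates it with the target network. The numbers below are
# made up purely to show the indexing; only the formulas mirror the snippet above.
import torch

gamma = 0.99
reward = torch.tensor([[1.0]])
q_online_next = torch.tensor([[2.0, 5.0, 3.0]])  # online network, Q(s', .)
q_target_next = torch.tensor([[4.0, 1.0, 2.0]])  # target network, Q_hat(s', .)

# vanilla DQN: y = r + gamma * max_a' Q_hat(s', a')
vanilla_target = reward + gamma * q_target_next.max(1)[0].unsqueeze(1)

# double DQN: y = r + gamma * Q_hat(s', argmax_a Q(s', a))
greedy_actions = q_online_next.max(1)[1]  # argmax under the online net -> action 1
double_target = reward + gamma * q_target_next.gather(1, greedy_actions.unsqueeze(1))

print(vanilla_target.item(), double_target.item())  # 4.96 vs 1.99
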
Exemple #14
0
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):

    print("running new version")
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channel of input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """

    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epilson greedy policy with given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history
            return model(Variable(obs, volatile=True)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    """ ---------------------------- OUR CODE ---------------------------- """
    Q = q_func(input_arg, num_actions)  # The parameters are random
    Qtag = q_func(input_arg, num_actions)
    if (USE_CUDA):
        Q.cuda()
        Qtag.cuda()
    Qtag.load_state_dict(Q.state_dict())

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)
    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)
    """ ------------------------------------------------------------------ """

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    reward = None
    done = None
    info = None
    LOG_EVERY_N_STEPS = 10000

    startTime = time.time()

    for t in count():
        """ Tsuf: ---- Stuff for debigging times for various places --- """
        T1 = 0
        t1Tmp = 0
        T2 = 0
        t2Tmp = 0
        T3 = 0
        t3Tmp = 0
        T4 = 0
        t4Tmp = 0
        T5 = 0
        t5Tmp = 0
        T6 = 0
        t6Tmp = 0
        T7 = 0
        t7Tmp = 0
        T8 = 0
        t8Tmp = 0
        """ ----------------------------------------------------------- """
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        #if (t>1000000):
        #    break
        ### 2. Step the env and store the transition

        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        """ -------------------------- OUR CODE -------------------------- """

        # store last_obs, and encode the latest observations as the network input
        t1Tmp = time.time()
        cur_idx = replay_buffer.store_frame(last_obs)
        next_input = replay_buffer.encode_recent_observation()
        T1 += time.time() - t1Tmp

        #take random action or use the net
        t2Tmp = time.time()
        action = select_epilson_greedy_action(
            Q, next_input, t)  #the returned action is on the CPU
        T2 += time.time() - t2Tmp

        #see what happens after we take that action
        t3Tmp = time.time()
        last_obs, reward, done, info = env.step(
            action)  #the returned parameters are on the CPU
        T3 += time.time() - t3Tmp

        #     print(t)
        # env.render()
        #store the results on the replay buffer
        replay_buffer.store_effect(cur_idx, action, reward, done)  #on the CPU

        #if the simulation is done, reset the environment
        if (done):
            last_obs = env.reset()
        """ -------------------------------------------------------------- """

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            """ ------------------------ OUR CODE ------------------------ """

            #sample a batch of history samples
            t4Tmp = time.time()
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)  #on CPU

            obs_batch = torch.from_numpy(obs_batch).type(
                dtype) / 255.0  # When available, move the samples batch to GPU
            next_obs_batch = torch.from_numpy(next_obs_batch).type(
                dtype) / 255.0  #GPU
            T4 += time.time() - t4Tmp

            #see which Q values the current network gives, for all obs's
            t5Tmp = time.time()
            inter_Qs = Q(
                Variable(obs_batch))  #input is on GPU, output is on GPU
            inter_Qs_chosen = Variable(
                torch.zeros(batch_size).type(dtype))  #GPU
            #take the action that was chosen before
            for i in range(batch_size):
                inter_Qs_chosen[i] = inter_Qs[i, act_batch[i]]
            #take only the intermediate (non-terminal) obs's
            inter_idx = np.where(done_mask == False)[0]  #CPU
            inter_next_obs_batch = next_obs_batch[inter_idx, :, :, :]
            T5 += time.time() - t5Tmp

            #see what the "target" (backuped) network says for the intermediate ones
            t6Tmp = time.time()
            inter_next_Qs = Qtag(
                Variable(inter_next_obs_batch,
                         volatile=True)).data.max(1)[0]  #All on GPU
            T6 += time.time() - t6Tmp

            #calculate the bellman errors
            t7Tmp = time.time()
            #for final obs's, the target is just the reward
            targets = torch.from_numpy(rew_batch).type(
                dtype)  #Moved rew_batch to GPU (as 'targets')
            for (i, idx) in enumerate(inter_idx):
                targets[idx] += gamma * inter_next_Qs[i]  #The bellman item
            # errors = -(inter_Qs_chosen.data - targets)**2 #EQUATION COULD BE WRONG!!   [on GPU]
            # for i in range(len(errors)):
            #     if errors[i]<-1:
            #         errors[i] = -1
            #     elif errors[i]>1:
            #         errors[i] = 1
            errors = inter_Qs_chosen.data - targets
            errors = errors.clamp(-1, 1)  #clamp() is out-of-place, so keep the returned tensor
            T7 += time.time() - t7Tmp

            #train the network! (:
            t8Tmp = time.time()
            optimizer.zero_grad()
            inter_Qs_chosen.backward(
                errors)  #the clipped (Q - target) is d(loss)/dQ for the squared Bellman error   [Everything is on GPU (: ]
            optimizer.step()
            T8 += time.time() - t8Tmp

            num_param_updates += 1
            if (num_param_updates % target_update_freq == 0):
                Qtag.load_state_dict(Q.state_dict())
            """ ---------------------------------------------------------- """

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)
        Statistic["running_times"].append(int(time.time() - startTime))

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            if (PRINT_TIMES):
                print("-----------------------")
                print(T1)
                print(T2)
                print(T3)
                print(T4)
                print(T5)
                print(T6)
                print(T7)
                print(T8)
                print("-----------------------")
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            with open('statistics.pkl', 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % 'statistics.pkl')
Exemple #15
0
def dqn_learing(
    env,
    q_func,
    optimizer_spec,
    exploration,
    feature_tested,
    stopping_criterion=None,
    replay_buffer_size=1000000,
    batch_size=32,
    gamma=0.99,
    learning_starts=50000,
    learning_freq=4,
    frame_history_len=4,
    target_update_freq=10000,
):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    # added bool_flag for double save statistic
    bool_flag = False
    STATS_FILE_NAME = 'statistics ' + feature_tested + '.pkl'

    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            with torch.no_grad():
                # Inference only: torch.no_grad() avoids saving the autograd history
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    # YOUR CODE HERE

    if USE_CUDA:
        Q = q_func(num_actions=num_actions).cuda()
        Q_target = q_func(num_actions=num_actions).cuda()
    else:
        Q = q_func(num_actions=num_actions)
        Q_target = q_func(num_actions=num_actions)

    Q_target.load_state_dict(Q.state_dict())

    # Check & load pretrained model
    if os.path.isfile('Q_params' + feature_tested + '.pkl'):
        print('Load Q parameters ...')
        Q.load_state_dict(torch.load('Q_params' + feature_tested + '.pkl'))

    if os.path.isfile('target_Q_params' + feature_tested + '.pkl'):
        print('Load target Q parameters ...')
        Q_target.load_state_dict(
            torch.load('target_Q_params' + feature_tested + '.pkl'))

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    Statistic = {
        "starting_Q_values": [],
        "mean_episode_rewards": [],
        "best_mean_episode_rewards": []
    }

    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')

    # load prev Stats
    start = 0
    if os.path.isfile(STATS_FILE_NAME):
        with open(STATS_FILE_NAME, 'rb') as f:
            Statistic = pickle.load(f)
            mean_episode_reward = Statistic["mean_episode_rewards"][-1]
            best_mean_episode_reward = Statistic["best_mean_episode_rewards"][
                -1]
            start = len(Statistic["mean_episode_rewards"])
            print('Load %s ...' % STATS_FILE_NAME)
    done = False

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 50000  #10000

    for t in count(start):
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            return Statistic
        # couldn't handle stopping_criterion alone, so also cap the total number of steps:
        if t > 4500000:
            return Statistic
        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        # YOUR CODE HERE
        idx = replay_buffer.store_frame(last_obs)
        encoded_obs = replay_buffer.encode_recent_observation()
        action = select_epilson_greedy_action(Q, encoded_obs, t)
        # if an episode just ended, log the Q-value of the new starting state
        ######
        # note: done still holds its value from the previous iteration (it was initialized to False above)

        if t > learning_starts and done:
            # a very expensive statistic - so don't log frequently
            with torch.no_grad():
                obs = torch.from_numpy(encoded_obs).type(dtype).unsqueeze(
                    0) / 255.0
                item = torch.max(Q(Variable(obs))).item()
            Statistic["starting_Q_values"].append(item)

        ######
        # this steps the environment forward one step
        last_obs, reward, done, info = env.step(action)
        replay_buffer.store_effect(idx, action, reward, done)

        if done:
            last_obs = env.reset()

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):

            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            # YOUR CODE HERE
            # 3.a sample a batch of transitions
            sample = replay_buffer.sample(batch_size)
            obs_batch, action_batch, reward_batch, next_obs_batch, done_mask = sample

            # move variables to GPU if available

            obs_batch = Variable(
                torch.from_numpy(obs_batch).type(dtype)) / 255.0
            action_batch = Variable(
                torch.from_numpy(action_batch).type(dtype).long().view(-1, 1))
            reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype)) / 255.0
            done_mask = Variable(torch.from_numpy(done_mask).type(dtype))

            # 3.b compute the Bellman error
            # evaluating the current and next Q-values
            state_action_values = Q(obs_batch).gather(1, action_batch)
            next_state_values = Q_target(next_obs_batch).detach()

            # maskout post terminal status Q-values
            masked_next_state_values = next_state_values.max(1)[0] * (
                1 - done_mask)
            # constructing the corresponding error
            expected_state_action_values = (masked_next_state_values *
                                            gamma) + reward_batch
            bellman_error = expected_state_action_values.unsqueeze(
                1) - state_action_values

            # clip the error between [-1,1]
            clipped_bellman_error = bellman_error.clamp(-1, 1)
            optimizer.zero_grad()
            # multiply by -1 (since pytorch minimizes)
            state_action_values.backward(-clipped_bellman_error)

            # 3.c: train the model
            optimizer.step()

            # 3.d periodically update the target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                Q_target.load_state_dict(Q.state_dict())

            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Save the trained model
            torch.save(Q.state_dict(), 'Q_params' + feature_tested + '.pkl')
            torch.save(Q_target.state_dict(),
                       'target_Q_params' + feature_tested + '.pkl')
            # Dump statistics to pickle
            #double save
            if bool_flag:
                bool_flag = False
                with open(STATS_FILE_NAME, 'wb') as f:
                    pickle.dump(Statistic, f)
                    print("Saved to %s" % STATS_FILE_NAME)
            else:
                bool_flag = True
                with open('copy_' + STATS_FILE_NAME, 'wb') as f:
                    pickle.dump(Statistic, f)
                    print("Saved to %s" % 'copy_' + STATS_FILE_NAME)

            plt.clf()
            plt.xlabel('Num of Games')
            plt.ylabel('Q-values on starting state')
            plt.plot(range(len(Statistic["starting_Q_values"])),
                     Statistic["starting_Q_values"],
                     label='Q-values')
            plt.legend()
            plt.title(feature_tested)
            plt.savefig('Q-value-Performance' + feature_tested + '.png')

            plt.clf()
            plt.xlabel('Timesteps')
            plt.ylabel('Mean Reward (past 100 episodes)')
            num_items = len(Statistic["mean_episode_rewards"])
            plt.plot(range(num_items),
                     Statistic["mean_episode_rewards"],
                     label='mean reward')
            plt.plot(range(num_items),
                     Statistic["best_mean_episode_rewards"],
                     label='best mean rewards')
            plt.legend()
            plt.title(feature_tested)
            plt.savefig('DeepQ-Performance' + feature_tested + '.png')
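
All three examples in this section take an optimizer_spec argument and build the optimizer with optimizer_spec.constructor(Q.parameters(), **optimizer_spec.kwargs). The spec itself is defined elsewhere; as a minimal sketch of the plumbing these examples assume (the namedtuple definition and the RMSprop hyperparameters below are illustrative, not the project's actual code):

from collections import namedtuple

import torch.nn as nn
import torch.optim as optim

OptimizerSpec = namedtuple("OptimizerSpec", ["constructor", "kwargs"])

# Illustrative spec: RMSprop with hyperparameters in the usual DQN ballpark.
optimizer_spec = OptimizerSpec(
    constructor=optim.RMSprop,
    kwargs=dict(lr=0.00025, alpha=0.95, eps=0.01),
)

# Inside dqn_learing the optimizer is then built straight from the spec.
model = nn.Linear(4, 2)  # placeholder for q_func(input_arg, num_actions)
optimizer = optimizer_spec.constructor(model.parameters(), **optimizer_spec.kwargs)
print(optimizer)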
Exemple #16
0
def dqn_learing(env,
                q_func,
                optimizer_spec,
                exploration,
                stopping_criterion=None,
                replay_buffer_size=1000000,
                batch_size=32,
                gamma=0.99,
                learning_starts=50000,
                learning_freq=4,
                frame_history_len=4,
                target_update_freq=10000):
    """Run Deep Q-learning algorithm.

    You can specify your own convnet using q_func.

    All schedules are w.r.t. total number of steps taken in the environment.

    Parameters
    ----------
    env: gym.Env
        gym environment to train on.
    q_func: function
        Model to use for computing the q function. It should accept the
        following named arguments:
            input_channel: int
                number of channels of the input.
            num_actions: int
                number of actions
    optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate schedule
        for the optimizer
    exploration: Schedule (defined in utils.schedule)
        schedule for probability of choosing a random action.
    stopping_criterion: (env) -> bool
        should return true when it's ok for the RL algorithm to stop.
        takes in env and the number of steps executed so far.
    replay_buffer_size: int
        How many memories to store in the replay buffer.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    gamma: float
        Discount Factor
    learning_starts: int
        After how many environment steps to start replaying experiences
    learning_freq: int
        How many steps of environment to take between every experience replay
    frame_history_len: int
        How many past frames to include as input to the model.
    target_update_freq: int
        How many experience replay rounds (not steps!) to perform between
        each update to the target Q network
    """
    assert type(env.observation_space) == gym.spaces.Box
    assert type(env.action_space) == gym.spaces.Discrete
    Statistic['parameters'] = {
        'replay_buffer_size': replay_buffer_size,
        'batch_size': batch_size,
        'gamma': gamma,
        'frame_history_len': frame_history_len,
        'learning_starts': learning_starts,
        'learning_freq': learning_freq,
        'target_update_freq': target_update_freq,
        'name': env.env.unwrapped.spec.id
    }
    ###############
    # BUILD MODEL #
    ###############

    if len(env.observation_space.shape) == 1:
        # This means we are running on low-dimensional observations (e.g. RAM)
        input_arg = env.observation_space.shape[0]
    else:
        img_h, img_w, img_c = env.observation_space.shape
        input_arg = frame_history_len * img_c
    num_actions = env.action_space.n

    # Construct an epsilon-greedy policy with the given exploration schedule
    def select_epilson_greedy_action(model, obs, t):
        sample = random.random()
        eps_threshold = exploration.value(t)
        if sample > eps_threshold:
            obs = torch.from_numpy(obs).type(dtype).unsqueeze(0) / 255.0
            # Inference only: torch.no_grad() avoids saving the autograd history
            with torch.no_grad():
                return model(Variable(obs)).data.max(1)[1].cpu()
        else:
            return torch.IntTensor([[random.randrange(num_actions)]])

    # Initialize target q function and q function, i.e. build the model.
    ######

    Q = q_func(input_arg, num_actions).type(dtype)
    target_Q = q_func(input_arg, num_actions).type(dtype)

    if USE_CUDA:
        Q = Q.cuda()
        target_Q = target_Q.cuda()

    ######

    # Construct Q network optimizer function
    optimizer = optimizer_spec.constructor(Q.parameters(),
                                           **optimizer_spec.kwargs)

    # Construct the replay buffer
    replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len)

    ###############
    # RUN ENV     #
    ###############
    num_param_updates = 0
    mean_episode_reward = -float('nan')
    best_mean_episode_reward = -float('inf')
    last_obs = env.reset()
    LOG_EVERY_N_STEPS = 10000
    filename = 'statistics.pkl'

    # Google Drive
    try:
        import google.colab
        IN_COLAB = True
    except ImportError:
        IN_COLAB = False

    if IN_COLAB:
        run_in_colab_message()
        try:
            from google.colab import auth
            import logging
            from pydrive.auth import GoogleAuth
            from pydrive.drive import GoogleDrive
            from oauth2client.client import GoogleCredentials
            logging.getLogger('googleapiclient.discovery_cache').setLevel(
                logging.ERROR)
            auth.authenticate_user()
            gauth = GoogleAuth()
            gauth.credentials = GoogleCredentials.get_application_default()
            drive = GoogleDrive(gauth)
        except Exception:
            pass

    iter_time = time()

    for t in count():
        ### 1. Check stopping criterion
        if stopping_criterion is not None and stopping_criterion(env):
            break

        ### 2. Step the env and store the transition
        # At this point, "last_obs" contains the latest observation that was
        # recorded from the simulator. Here, your code needs to store this
        # observation and its outcome (reward, next observation, etc.) into
        # the replay buffer while stepping the simulator forward one step.
        # At the end of this block of code, the simulator should have been
        # advanced one step, and the replay buffer should contain one more
        # transition.
        # Specifically, last_obs must point to the new latest observation.
        # Useful functions you'll need to call:
        # obs, reward, done, info = env.step(action)
        # this steps the environment forward one step
        # obs = env.reset()
        # this resets the environment if you reached an episode boundary.
        # Don't forget to call env.reset() to get a new observation if done
        # is true!!
        # Note that you cannot use "last_obs" directly as input
        # into your network, since it needs to be processed to include context
        # from previous frames. You should check out the replay buffer
        # implementation in dqn_utils.py to see what functionality the replay
        # buffer exposes. The replay buffer has a function called
        # encode_recent_observation that will take the latest observation
        # that you pushed into the buffer and compute the corresponding
        # input that should be given to a Q network by appending some
        # previous frames.
        # Don't forget to include epsilon greedy exploration!
        # And remember that the first time you enter this loop, the model
        # may not yet have been initialized (but of course, the first step
        # might as well be random, since you haven't trained your net...)
        #####

        idx = replay_buffer.store_frame(last_obs)
        enc_obs = replay_buffer.encode_recent_observation()

        if t > learning_starts:
            action = select_epilson_greedy_action(Q, enc_obs, t)
        else:
            action = torch.IntTensor([[random.randrange(num_actions)]])

        obs, reward, done, info = env.step(action)
        if done:
            obs = env.reset()

        replay_buffer.store_effect(idx, action, reward, done)

        last_obs = obs

        #####

        # at this point, the environment should have been advanced one step (and
        # reset if done was true), and last_obs should point to the new latest
        # observation

        ### 3. Perform experience replay and train the network.
        # Note that this is only done if the replay buffer contains enough samples
        # for us to learn something useful -- until then, the model will not be
        # initialized and random actions should be taken
        if (t > learning_starts and t % learning_freq == 0
                and replay_buffer.can_sample(batch_size)):
            # Here, you should perform training. Training consists of four steps:
            # 3.a: use the replay buffer to sample a batch of transitions (see the
            # replay buffer code for function definition, each batch that you sample
            # should consist of current observations, current actions, rewards,
            # next observations, and done indicator).
            # Note: Move the variables to the GPU if available
            # 3.b: fill in your own code to compute the Bellman error. This requires
            # evaluating the current and next Q-values and constructing the corresponding error.
            # Note: don't forget to clip the error between [-1,1], multiply it by -1 (since pytorch minimizes) and
            #       mask out post-terminal Q-values (see ReplayBuffer code).
            # 3.c: train the model. To do this, use the Bellman error you calculated previously.
            # Pytorch will differentiate this error for you, to backward the error use the following API:
            #       current.backward(d_error.data.unsqueeze(1))
            # Where "current" is the variable holding current Q Values and d_error is the clipped bellman error.
            # Your code should produce one scalar-valued tensor.
            # Note: don't forget to call optimizer.zero_grad() before the backward call and
            #       optimizer.step() after the backward call.
            # 3.d: periodically update the target network by loading the current Q network weights into the
            #      target_Q network. see state_dict() and load_state_dict() methods.
            #      you should update every target_update_freq steps, and you may find the
            #      variable num_param_updates useful for this (it was initialized to 0)
            #####

            #3.a
            obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = replay_buffer.sample(
                batch_size)
            obs_batch = Variable(torch.from_numpy(obs_batch).type(dtype) /
                                 255.,
                                 requires_grad=True)
            act_batch = Variable(torch.from_numpy(act_batch).type(torch.int64))
            rew_batch = Variable(torch.from_numpy(rew_batch).type(dtype),
                                 requires_grad=True)
            next_obs_batch = Variable(
                torch.from_numpy(next_obs_batch).type(dtype) / 255.,
                requires_grad=True)
            done_mask = Variable(torch.from_numpy(done_mask).type(torch.int64))

            if USE_CUDA:
                obs_batch = obs_batch.cuda()
                act_batch = act_batch.cuda()
                rew_batch = rew_batch.cuda()
                next_obs_batch = next_obs_batch.cuda()
                done_mask = done_mask.cuda()

            # Q network
            val = Q(obs_batch).gather(dim=1, index=act_batch.unsqueeze(1))

            # Q target network
            with torch.no_grad():
                tar_val_t = target_Q(next_obs_batch).max(1)[0]
            tar_val = torch.addcmul(rew_batch, gamma,
                                    1 - done_mask.type(dtype), tar_val_t)

            # 3.b error calculate
            d_error = (tar_val - val.squeeze()).clamp_(-1, 1) * -1.
            # d_error = torch.pow((tar_val - val.squeeze()).clamp_(-1, 1), 2) * -1.

            # 3.c train Q network
            optimizer.zero_grad()
            val.backward(d_error.data.unsqueeze(1))
            optimizer.step()

            # 3.d update target network
            num_param_updates += 1
            if num_param_updates % target_update_freq == 0:
                target_Q.load_state_dict(Q.state_dict())
            #####

        ### 4. Log progress and keep track of statistics
        episode_rewards = get_wrapper_by_name(env,
                                              "Monitor").get_episode_rewards()
        if len(episode_rewards) > 0:
            mean_episode_reward = np.mean(episode_rewards[-100:])
        if len(episode_rewards) > 100:
            best_mean_episode_reward = max(best_mean_episode_reward,
                                           mean_episode_reward)

        Statistic["mean_episode_rewards"].append(mean_episode_reward)
        Statistic["best_mean_episode_rewards"].append(best_mean_episode_reward)

        if t % LOG_EVERY_N_STEPS == 0 and t > learning_starts:
            print("Timestep %d" % (t, ))
            print(f"Iteration time:{time()-iter_time:.2f}")
            iter_time = time()
            print("mean reward (100 episodes) %f" % mean_episode_reward)
            print("best mean reward %f" % best_mean_episode_reward)
            print("episodes %d" % len(episode_rewards))
            print("exploration %f" % exploration.value(t))
            sys.stdout.flush()

            # Dump statistics to pickle
            filename = f"{t}" + 'statistics.pkl' if IN_COLAB else 'statistics.pkl'
            with open(filename, 'wb') as f:
                pickle.dump(Statistic, f)
                print("Saved to %s" % filename)
            if IN_COLAB and t % (LOG_EVERY_N_STEPS * 10) == 0:
                try:
                    stat_pkl = drive.CreateFile()
                    stat_pkl.SetContentFile(filename)
                    stat_pkl.Upload()
                    print("Uploaded to drive")
                except Exception:
                    print("Exception during upload to drive")