Example #1

        # this section was for actor experience replay, which to my dismay performed much worse than without replay:
        # actor_replay_buffer.add(target, u_value, -log_prob)
        # sample_objectives = actor_replay_buffer.sample(sample_size=32)
        # actor_optimizer.zero_grad()
        # # compute the gradient from the sampled log probability
        # #  the log probability times the Q of the action that you just took in that state

        # Important note: reward shaping, this performs much better.
        # In the general case this might not be a good idea. If there are rare events with extremely high rewards
        # that only occur in some episodes, and the majority of episodes only experience common events with
        # lower-scale rewards, then this trick will mess up training. In the CartPole environment this is not a
        # concern, since every raw reward is 1.
        if done:
            if episode_timestep <= 170:
                reward = -500
            else:
                reward = 50
        else:
            reward = 20

        # Update parameters of critic by TD(0)
        # TODO : Use TD Lambda here and compare the performance
        u_value = critic(cur_state)

        # target = reward + gamma * critic(next_state)
        # (alternative: a separate critic_old network can be used for the bootstrap,
        #  i.e. target = reward + gamma * (1 - done) * critic_old(next_state))
        # Using 1-done even in the target for the actor, since the next state won't have any meaning when done=1
        # TODO : Remove the 1-done factor if it turns out to be a wrong concept for the actor
        target = reward + gamma * (1 - done) * critic(next_state)
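        # e.g. an early terminal step (done=1, episode_timestep <= 170) gets target = -500, while a
        # non-terminal step gets target = 20 + gamma * critic(next_state)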

        replay_buffer.add(cur_state, action, next_state, reward, done)
        # sample a minibatch of transitions from the replay buffer
        # the sampling is done every timestep, not every episode
        sample_transitions = replay_buffer.sample_pytorch()
        # update the critic's value approximation using the sampled transitions
        running_loss1_mean += update_critic(**sample_transitions)
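        # A minimal sketch of what update_critic might look like (an assumption, not the original implementation;
        # the keyword names are assumed to match what sample_pytorch returns, hence the ** unpacking, and
        # critic_optimizer is assumed):
        #
        #     def update_critic(cur_state, action, next_state, reward, done):
        #         with torch.no_grad():
        #             td_target = reward + gamma * (1 - done) * critic(next_state)
        #         loss = torch.nn.functional.mse_loss(critic(cur_state), td_target)
        #         critic_optimizer.zero_grad()
        #         loss.backward()
        #         critic_optimizer.step()
        #         return loss.item()
        #
        # If a separate critic_old network is kept, it can be passed in as
        # update_critic(critic_old, **sample_transitions) and used for the bootstrap term instead of critic.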

        target_list = torch.cat([target_list, target])
        u_value_list = torch.cat([u_value_list, u_value])
        log_prob_list = torch.cat([log_prob_list, log_prob.reshape(-1)])
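        # the per-step tensors are accumulated so the actor can be updated once at the end of the episode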

        episode_reward += reward
        episode_timestep += 1
        cur_state = next_state

    # Update parameters of actor by policy gradient
    actor_optimizer.zero_grad()
    # compute the gradient from the sampled log probability
    #  the log probability times the Q of the action that you just took in that state
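    # A minimal sketch of the actor update this sets up (an assumption, not the original code): each log-probability
    # is weighted by the TD target, or by the advantage (target - u_value) to reduce variance, e.g.
    #
    #     advantage = (target_list - u_value_list).detach()
    #     actor_loss = -(log_prob_list * advantage).sum()
    #     actor_loss.backward()
    #     actor_optimizer.step()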