Example #1
 state_buffer.appendleft(last_processed_frame)
 
 if len(state_buffer) == phi_state_size:
     # Stack the last phi_state_size processed frames (one per action selection) to form the new state
     new_state = np.stack(state_buffer, axis=-1)
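     # e.g. shape (84, 84, phi_state_size), assuming frames are preprocessed to 84x84 (size not shown in this listing)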
     
     if state is not None:
         # Record previous state, new state, action taken and reward observed in the replay buffer
         replay_buffer.add(state, action, reward_from_action, new_state)
     
     # Let the agent learn once enough frames have been experienced to prime the replay buffer
     if frame_number > min_experience_frame_limit:
         
         # Only update the CNN weights when the update interval has elapsed
         if action_selects_until_weight_update == 0:
             minibatch = replay_buffer.get_random_minibatch()
             agent.learn(minibatch)
             action_selects_until_weight_update = cnn_learning_interval
         else:
             # The agent will select an action this step; count down to the next CNN weight update
             action_selects_until_weight_update -= 1
     
     # New state becomes old state
     state = new_state
                 
 # Select the action to take for the next action_frame_repeat frames; act randomly until the replay buffer is primed
 if state is None or frame_number <= min_experience_frame_limit:
     action = agent.select_epsilon_greedy_action()
 else:
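     # np.newaxis prepends a batch dimension of 1 so the stacked state matches the CNN's input shape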
     action = agent.select_epsilon_greedy_action(state[np.newaxis, ...])
 action_num_left_to_repeat = action_frame_repeat - 1
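
The listing above is a fragment of the per-frame training loop, so it relies on setup that isn't shown. Below is a minimal sketch of those assumed pieces; the hyperparameter values and the ReplayBuffer internals are illustrative guesses matched to the identifiers used in the listing, not the author's actual definitions.

 import random
 from collections import deque

 import numpy as np  # np.stack is used in the listing above

 # Assumed hyperparameters; the values here are illustrative only
 phi_state_size = 4                   # frames stacked to form one state
 action_frame_repeat = 4              # frames each selected action is repeated for
 cnn_learning_interval = 4            # action selections between weight updates
 min_experience_frame_limit = 50000   # frames of random play before learning starts

 class ReplayBuffer:
     """Fixed-capacity experience store (a sketch, not the author's class)."""

     def __init__(self, capacity=100000, minibatch_size=32):
         self.transitions = deque(maxlen=capacity)
         self.minibatch_size = minibatch_size

     def add(self, state, action, reward, new_state):
         # Oldest transitions are evicted automatically once capacity is reached
         self.transitions.append((state, action, reward, new_state))

     def get_random_minibatch(self):
         # Uniform sample without replacement over stored transitions
         return random.sample(self.transitions, self.minibatch_size)

 # Only the newest phi_state_size frames are kept; appendleft puts the most
 # recent frame first, so the stacked state is ordered newest-to-oldest
 state_buffer = deque(maxlen=phi_state_size)
 replay_buffer = ReplayBuffer()
 state = None
 action_selects_until_weight_update = cnn_learning_interval

Note that if state_buffer is a deque with maxlen=phi_state_size, the len(state_buffer) == phi_state_size check in the listing passes on every step once the buffer first fills, so it effectively gates only the first few frames.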