def run_episode(env, agent, deterministic, do_training=True, rendering=False, max_timesteps=1000):
    """Run one episode in a gym environment and collect statistics.

    deterministic == True => the agent executes only greedy actions according
                             to the Q-function approximator (no random actions).
    do_training == True   => the agent is trained on every transition.

    Returns an EpisodeStats object recording episode reward and action usage.
    """
    episode_stats = EpisodeStats()  # tracks episode reward and per-action counts
    state = env.reset()

    t = 0
    while True:
        # Query the agent for the next action in the current state.
        action_id = agent.act(state=state, deterministic=deterministic)
        next_state, reward, terminal, _ = env.step(action_id)

        if do_training:
            agent.train(state, action_id, next_state, reward, terminal)

        episode_stats.step(reward, action_id)
        state = next_state

        if rendering:
            env.render()

        # Counter is incremented only after this check (original behavior kept).
        if terminal or t > max_timesteps:
            break
        t += 1

    return episode_stats
def run_episode(env, agent, deterministic, skip_frames=0, do_training=True, rendering=False, max_timesteps=1000, history_length=0):
    """Run one episode in a gym environment with frame skipping and an image history.

    deterministic == True => the agent executes only greedy actions according
                             to the Q-function approximator (no random actions).
    do_training == True   => the agent is trained on every (accumulated) transition.
    skip_frames           => the chosen action is repeated skip_frames extra times;
                             rewards over the repeated frames are summed.
    history_length        => number of previous frames stacked with the current one,
                             giving states of shape (96, 96, history_length + 1).

    Returns an EpisodeStats object recording episode reward and action usage.
    """
    stats = EpisodeStats()  # tracks episode reward and per-action counts

    # Rolling window of the last (history_length + 1) preprocessed frames.
    image_hist = []

    step = 0
    state = env.reset()

    # fix bug of corrupted states without rendering in gym environment
    env.viewer.window.dispatch_events()

    # Seed the history with copies of the first frame so the very first
    # stacked state already has the full (96, 96, history_length + 1) shape.
    state = state_preprocessing(state)
    image_hist.extend([state] * (history_length + 1))
    state = np.array(image_hist).reshape(96, 96, history_length + 1)

    while True:
        # FIX: the original left this as a TODO, so `action_id`/`action` were
        # undefined and the loop raised NameError on its first iteration.
        action_id = agent.act(state=state, deterministic=deterministic)
        # NOTE(review): id_to_action is expected to map the discrete action id
        # to the environment's action format (the original TODO hinted at an
        # "id_to_action" helper) — confirm the helper's actual name/module.
        action = id_to_action(action_id)

        # Frame skipping: repeat the chosen action and accumulate the reward.
        reward = 0
        for _ in range(skip_frames + 1):
            next_state, r, terminal, info = env.step(action)
            reward += r

            if rendering:
                env.render()

            if terminal:
                break

        # Slide the frame window forward and rebuild the stacked state.
        next_state = state_preprocessing(next_state)
        image_hist.append(next_state)
        image_hist.pop(0)
        next_state = np.array(image_hist).reshape(96, 96, history_length + 1)

        if do_training:
            agent.train(state, action_id, next_state, reward, terminal)

        stats.step(reward, action_id)

        state = next_state

        # `step` counts agent decisions; scale by the frames consumed per
        # decision so max_timesteps bounds environment steps (original behavior).
        if terminal or (step * (skip_frames + 1)) > max_timesteps:
            break

        step += 1

    return stats