def episode_encoded(AE, env_name='BreakoutDeterministic-v3', minimum_score=0, onehot=True, video=False):
    """
    Play random-action episodes until one scores strictly above
    `minimum_score`, and return that episode as a list of encoded transitions.

    Every frame is cropped with `crop_state`, given a leading axis and encoded
    with `AE.flat_encode`. Each step adds one entry to the output, obtained by
    passing
    [encoded state, action, reward, encoded next state, [done, done]]
    through `flat2list(..., as_tuple=True)`; the done flag is stored as 1 or 0.

    Args:
        AE: object exposing `flat_encode`, used to encode the cropped frames.
        env_name (str): name handed to `envs.Atari` to build the environment.
        minimum_score: an episode is returned only if its cumulative reward
            is strictly greater than this value; otherwise it is discarded
            and a new episode is played.
        onehot (bool): if true, the action is stored as
            `onehot_encode(action, n_actions)`, otherwise as the raw index.
        video (bool): if true, `env.render()` is called after every step.

    Returns:
        list: the transitions of the first episode whose cumulative reward
        exceeds `minimum_score`.

    Note:
        The function loops until a qualifying episode is found, so it does
        not terminate if the random policy can never exceed `minimum_score`.
    """
    env = envs.Atari(env_name)
    action_space = env.action_space.n

    def encode(frame):
        # Crop the raw frame, add a leading axis and run it through the encoder.
        preprocessed = np.expand_dims(np.asarray(crop_state(frame)), axis=0)
        return AE.flat_encode(preprocessed)

    # The score is checked after an episode rather than before it, so that at
    # least one episode is always played: a pre-check against an initial score
    # of 0 would skip the loop for negative thresholds and leave nothing to
    # return.
    while True:
        cumulative_reward = 0
        encoded_state = encode(env.reset())
        done = False

        # Start episode
        ep_output = []
        while not done:
            # Select a uniformly random action
            action = random.randrange(0, action_space)

            # Execute the action, get next state and reward
            next_state, reward, done, _ = env.step(action)
            cumulative_reward += reward
            encoded_next_state = encode(next_state)

            # Append the flattened SARS transition to the episode
            action_to_append = onehot_encode(action, action_space) if onehot else action
            sars_list = [
                encoded_state,
                action_to_append,
                reward,
                encoded_next_state,
                [1 if done else 0] * 2,
            ]
            ep_output.append(flat2list(sars_list, as_tuple=True))

            # Render environment
            if video:
                env.render()

            # Update state
            encoded_state = encoded_next_state

        if cumulative_reward > minimum_score:
            return ep_output
def _eval_with_FE(mdp, policy, AE, metric, selected_states=None, max_ep_len=np.inf, render=False):
    """
    Run one evaluation episode of `policy` on `mdp`, feeding the policy with
    features extracted from the frames, and return the episode performance.

    Every frame is cropped with `crop_state`, given a leading axis and encoded
    with `AE.flat_encode`. When `selected_states` is given, the encoded
    features are additionally passed through `filter_state_with_RFS`.

    Args:
        mdp: environment exposing `reset`, `step`, `render` and, for the
            'discounted' metric, a `gamma` attribute.
        policy: object exposing `draw_action(state, done, evaluation=True)`.
        AE: object exposing `flat_encode`.
        metric (str): 'discounted' weights rewards by powers of `mdp.gamma`;
            'average' divides the summed reward by the number of steps;
            any other value yields the plain sum of rewards.
        selected_states: optional selection forwarded to
            `filter_state_with_RFS`; `None` disables the filtering.
        max_ep_len: maximum number of steps executed in the episode.
        render (bool): if true, `mdp.render(mode='human')` is called after
            every step.

    Returns:
        tuple: `(ep_performance, frame_counter)`, the performance under the
        requested metric and the number of steps actually executed.
    """
    gamma = mdp.gamma if metric == 'discounted' else 1
    ep_performance = 0.0
    df = 1.0  # Discount weight applied to the next reward

    def extract_features(frame):
        # Crop, add a leading axis, encode and optionally filter one frame.
        preprocessed = np.expand_dims(np.asarray(crop_state(frame)), axis=0)
        encoded = AE.flat_encode(preprocessed)
        if selected_states is not None:
            return filter_state_with_RFS(encoded, selected_states)
        return encoded

    # Start episode
    done = False
    filtered_state = extract_features(mdp.reset())

    frame_counter = 0
    # Strict comparison: the counter starts at 0, so `<` executes at most
    # `max_ep_len` steps (a `<=` test would allow one step too many).
    while not done and frame_counter < max_ep_len:
        frame_counter += 1

        # Select an action
        action = policy.draw_action(filtered_state, done, evaluation=True)

        # Execute the action, get next state and reward
        next_state, reward, done, _ = mdp.step(action)
        ep_performance += df * reward  # Update performance
        df *= gamma  # Update discount weight

        # Extract the features of the next state
        filtered_state = extract_features(next_state)

        # Render environment
        if render:
            mdp.render(mode='human')

    # No step is executed when the horizon is not positive; skip the division
    # in that case instead of raising ZeroDivisionError.
    if metric == 'average' and frame_counter > 0:
        ep_performance /= frame_counter

    return ep_performance, frame_counter
def episode_images(episode_id, logger, env_name='BreakoutDeterministic-v3', video=False):
    """
    Play one random-action episode, saving every cropped frame to disk, and
    return the transitions expressed through the ids of the saved frames.

    Each frame gets the id '<episode_id zero-padded to 4 digits>_<step>' and
    is written with `np.save` to `logger.path + id`, after being cropped with
    `crop_state`. The initial frame has step 0.

    Args:
        episode_id (int): episode number used as the prefix of the frame ids.
        logger: object whose `path` attribute is prepended to the frame ids.
        env_name (str): name handed to `envs.Atari` to build the environment.
        video (bool): if true, `env.render()` is called after every step.

    Returns:
        list: one `[state_id, action, reward, next_state_id]` entry per step.
    """
    env = envs.Atari(env_name)
    n_actions = env.action_space.n

    def store_frame(frame, step):
        # Build the frame id, dump the cropped frame under it, hand the id back.
        frame_id = '%04d_%d' % (episode_id, step)
        np.save(logger.path + frame_id, crop_state(frame))
        return frame_id

    # Save the initial frame
    step = 0
    current_id = store_frame(env.reset(), step)

    # Start episode
    transitions = []
    finished = False
    while not finished:
        step += 1

        # Pick a uniformly random action and execute it
        action = random.randrange(0, n_actions)
        frame, reward, finished, _ = env.step(action)

        # Save the resulting frame and record the transition
        following_id = store_frame(frame, step)
        transitions.append([current_id, action, reward, following_id])

        # Render environment
        if video:
            env.render()

        # The frame just reached is the starting point of the next step
        current_id = following_id

    return transitions