from typing import Any, Callable, Dict, List, Optional, Tuple

from gym.wrappers import TimeLimit


def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Optional[Callable[[int, TimeLimit, Any, Any, Any, bool, Any], None]] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    """Play one episode and return the mean reward per step together with the
    history of (observation, action) pairs."""
    observation = env.reset()
    score = 0
    history = []
    for i in range(max_size):
        if render:
            env.render()
        action = action_chooser(env, observation)
        current_iteration_history = {"observation": observation, "action": action}
        observation, reward, done, info = env.step(action.reshape((-1,)))
        score += reward
        history.append(current_iteration_history)
        if custom_actions is not None:
            custom_actions(i, env, action, observation, reward, done, info)
        if stop_when_done and done:
            break
    # NOTE: the score is averaged over max_size, even if the episode ends early.
    return score / max_size, history
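# A minimal usage sketch for `play_one_session`, assuming a continuous-control
# task such as Pendulum-v0, where actions are numpy arrays (so the
# `action.reshape((-1,))` call above is valid). `random_chooser` is a
# hypothetical helper written here for illustration only.
import gym


def random_chooser(env: TimeLimit, observation: Any) -> Any:
    # Ignore the observation and sample a random array-valued action.
    return env.action_space.sample()


if __name__ == "__main__":
    env = TimeLimit(gym.make("Pendulum-v0").unwrapped, max_episode_steps=200)
    mean_reward, history = play_one_session(env, max_size=200, action_chooser=random_chooser)
    print(f"Mean reward per step: {mean_reward:.3f} over {len(history)} recorded steps")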
from gym.envs.classic_control import MountainCarEnv
from gym.wrappers import TimeLimit


def play_with_car():
    maximum_steps_allowed = 250
    env = TimeLimit(MountainCarEnv(), max_episode_steps=maximum_steps_allowed + 1)
    actions = {'left': 0, 'stop': 1, 'right': 2}
    initial_state = env.reset()
    print('Initial state: ', initial_state)
    for t in range(maximum_steps_allowed):
        # Hand-tuned policy: rock the car back and forth to build momentum.
        # May need to be modified if the environment changes.
        if t < 50:
            s, r, done, _ = env.step(actions['left'])
        elif t < 70:
            s, r, done, _ = env.step(actions['right'])
        elif t < 120:
            s, r, done, _ = env.step(actions['left'])
        else:
            s, r, done, _ = env.step(actions['right'])
        print('State {}, Reward {}, Step {}'.format(s, r, t))
        env.render()
        if done:
            # The goal position on MountainCar is ~0.5; a position beyond
            # 0.47 here is treated as having reached the flag.
            if s[0] > 0.47:
                print('Well done!')
            else:
                print('Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')
    env.close()
def test_change_mass_each_step(self):
    env: ModifiedMassEnv = self.Environment()
    max_episode_steps = 500
    n_episodes = 5

    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env: ModifiedMassEnv  # Re-annotate for the type checker after wrapping.

    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0
        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state
        body_part = self.body_names[0]
        start_mass = env.get_mass(body_part)

        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1
            # Increase the mass of this body part continually over time.
            env.set_mass(
                body_part=body_part,
                mass=start_mass + 5 * total_steps / max_episode_steps,
            )
            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {state[1] > previous_state[1]}")

        print(f"Mass of {body_part} at end of episode: {env.get_mass(body_part)}")

    assert total_steps == n_episodes * max_episode_steps

    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    assert initial_z == 0
    # Check that the robot is high up in the sky! :D
    assert final_z > 20
def test_change_gravity_each_step(self):
    env: ModifiedGravityEnv = self.Environment()
    max_episode_steps = 50
    n_episodes = 3

    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)

    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0
        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state

        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1
            # Weaken the (downward) gravity continually over time, from -10
            # up to +5. By the end, things should be floating.
            env.set_gravity(-10 + 5 * total_steps / max_episode_steps)
            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {state[1] > previous_state[1]}")

        if episode_steps != max_episode_steps:
            print("Episode ended early?")
        print(f"Gravity at end of episode: {env.gravity}")

        # TODO: Check that the position (in the observation) is obeying gravity?
        # if env.gravity <= 0:
        #     # Downward force, so should not have any significant preference for
        #     # moving up vs moving down.
        #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
        # # if env.gravity == 0:
        # #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
        # if env.gravity > 0:
        #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity

    assert total_steps <= n_episodes * max_episode_steps

    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    if env.gravity > 0:
        assert final_z > initial_z
def test_task_schedule_monsterkong():
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            100: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True,
    )
    obs = env.reset()
    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        # The task schedule switches levels every 100 steps.
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4

    # The level stays the same even past the last entry in the schedule.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    env.close()
def main():
    env = make_cmdp(args.cmdp, episodic=True)
    env = TimeLimit(env, 10)

    agent_model_name = args.cmdp.split('/')[-1]
    agent_model = agent_models.get_agent_model(agent_model_name)

    values_df_index = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    values_df_columns = env.model.actions

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

        # Estimate Q-values three ways: unconditionally, conditioning on the
        # action, and intervening on the action (do-operator).
        Qs_none = [
            infer_Q(env, action, 'none', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_condition = [
            infer_Q(env, action, 'condition', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        Qs_intervention = [
            infer_Q(env, action, 'intervention', agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]
        values_df = pd.DataFrame(
            [Qs_none, Qs_condition, Qs_intervention],
            index=values_df_index,
            columns=values_df_columns,
        )
        print(values_df)

        # Act greedily with respect to the interventional Q-values.
        action = torch.tensor(Qs_intervention).argmax()
        state, _, done, _ = env.step(action)
        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
def test_task_schedule_with_callables():
    """ Apply functions to the env at given steps. """
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    from operator import methodcaller
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: methodcaller("set_level", 0),
            100: methodcaller("set_level", 1),
            200: methodcaller("set_level", 2),
            300: methodcaller("set_level", 3),
            400: methodcaller("set_level", 4),
        },
        add_task_id_to_obs=True,
    )
    obs = env.reset()
    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0

    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()

    assert obs[1] == 4
    assert env.level == 4

    # The level stays the same even past the last entry in the schedule.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
def main():
    env = make_mdp(args.mdp, episodic=True)
    env = TimeLimit(env, 10)

    env.reset()
    for t in itt.count():
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()

        action = policy(env, log=True)
        _, reward, done, _ = env.step(action)
        print(f'reward: {reward}')

        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break

    env.close()
raw_rewards = np.zeros((len(env.rfs), args.batch_size))
real_rewards = []
invalid_action_stats = []
dones = np.zeros((args.batch_size,))
values = torch.zeros((args.batch_size,)).to(device)
invalid_action_masks = torch.zeros((args.batch_size, env.action_space.nvec.sum()))

# TRY NOT TO MODIFY: prepare the execution of the game.
for step in range(args.batch_size):
    env.render()
    global_step += 1
    obs[step] = next_obs.copy()

    # ALGO LOGIC: put action logic here.
    # Build the mask over the flattened MultiDiscrete action space: the first
    # component selects the source unit, the last selects the target location.
    invalid_action_mask = torch.ones(env.action_space.nvec.sum())
    invalid_action_mask[0:env.action_space.nvec[0]] = torch.tensor(env.unit_location_mask)
    invalid_action_mask[-env.action_space.nvec[-1]:] = torch.tensor(env.target_unit_location_mask)
    invalid_action_masks[step] = invalid_action_mask

    with torch.no_grad():
        values[step] = vf.forward(obs[step:step + 1])
        action, logproba, _, probs = pg.get_action(
            obs[step:step + 1],
            invalid_action_masks=invalid_action_masks[step:step + 1],
        )
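# For context, a minimal sketch (not the project's actual implementation) of
# how such an invalid-action mask is typically applied inside a policy like
# `pg.get_action`: masked-out logits are pushed to a large negative value, so
# their softmax probability becomes (numerically) zero.
import torch
from torch.distributions import Categorical


def masked_categorical(logits: torch.Tensor, mask: torch.Tensor) -> Categorical:
    # Where the mask is 0, replace the logit with a very negative number.
    masked_logits = torch.where(mask.bool(), logits, torch.full_like(logits, -1e8))
    return Categorical(logits=masked_logits)


# Example: four candidate actions, the last two invalid.
dist = masked_categorical(torch.zeros(4), torch.tensor([1.0, 1.0, 0.0, 0.0]))
print(dist.probs)  # ~[0.5, 0.5, 0.0, 0.0]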
from typing import Any, List

from gym.wrappers import TimeLimit


def replay_memory(env: TimeLimit, memory: List[List[Any]]):
    """Visually replay recorded episodes by re-executing their actions.

    Note that this only reproduces an episode faithfully if the environment
    resets to the same state and is deterministic.
    """
    for episode_memory in memory:
        env.reset()
        for action in episode_memory:
            env.step(action)
            env.render()
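# A small usage sketch for `replay_memory`, under the same old 4-tuple gym
# step API used throughout these snippets: record the actions of one random
# episode, then replay them on screen.
import gym

env = TimeLimit(gym.make("MountainCar-v0").unwrapped, max_episode_steps=200)

episode_actions = []
env.reset()
done = False
while not done:
    action = env.action_space.sample()
    episode_actions.append(action)
    _, _, done, _ = env.step(action)

replay_memory(env, [episode_actions])
env.close()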
import time
from copy import deepcopy

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
from gym.wrappers import TimeLimit

from envs import Swingup, Reacher, InvertedDoublePendulum, InvertedPendulum, Walker
from rebar.learners.adp import ADP
from rebar.learners.qlearner import QLearner

# `envs.Reacher` is assumed here to be an environment instance; each wrapper
# gets its own deep copy, so the three envs don't share state.
env = Reacher
eval_env = TimeLimit(deepcopy(env), max_episode_steps=500)
play_env = TimeLimit(deepcopy(env), max_episode_steps=200)
env = TimeLimit(deepcopy(env), max_episode_steps=1000)
play_env.render()

# Swingup state = <x, vx, cos(theta), sin(theta), thetadot>
q = QLearner(
    action_space=env.action_space,
    observation_space=env.observation_space,
    Q='simple',
    opt_args={'lr': 0.01},
    memory_len=1000,
    gamma=0.999,
    initial_epsilon=1.,
    final_epsilon=0.01,
    exploration_steps=50000,
    target_lag=100,
)
adp = ADP(action_space=env.action_space,
import numpy as np
from gym.wrappers import TimeLimit


def evaluate(
    env: TimeLimit,
    total_episodes: int,
    *,
    q_table: np.ndarray = None,
    winning_reward: float = None,
    is_random: bool = False,
    render: bool = False,
    display_result: bool = False,
) -> float:
    """Evaluate how well a Q-table solves a gym environment.

    It can also act randomly instead of using a Q-table, in order to compare
    the performance of a Q-table against a random baseline.

    :param env: gym environment to solve
    :param total_episodes: number of episodes to run. The bigger, the more
        statistically significant the output will be
    :param q_table: Q-table used to solve the problem; if given, is_random
        must be False
    :param winning_reward: the reward the environment gives the agent when it
        solves the problem. It is used to count how many episodes were won
    :param is_random: if True, act randomly instead of using a Q-table.
        If True, q_table must not be given
    :param render: if True, call env.render() at every step
    :param display_result: if True, print an evaluation summary to the
        console at the end
    :return: percentage of episodes won
    """
    # TODO: rename and re-think the is_random parameter into a policy parameter
    # TODO: render only the last evaluation
    # TODO: yield the q-table, evaluate it, and continue evaluation if it is not good enough
    if (q_table is not None) and is_random:
        raise RuntimeError("is_random and q_table given")
    elif q_table is None and not is_random:
        raise RuntimeError("at least one of q_table and is_random must be given")

    total_epochs, total_reward, total_won_episodes = 0, 0, 0
    for _ in range(total_episodes):
        state = env.reset()
        if render:
            env.render()
        done = False
        while not done:
            if is_random:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state, :])
            state, reward, done, info = env.step(action)
            total_epochs += 1
            total_reward += reward
            if render:
                env.render()
        # noinspection PyUnboundLocalVariable
        if reward == winning_reward:
            total_won_episodes += 1

    score = round(total_won_episodes * 100 / total_episodes, 2)
    if display_result:
        print("-" * 30)
        print(f"Results after {total_episodes} episodes using {'random' if is_random else 'q_table'}:")
        print(f"Average steps per episode: {total_epochs / total_episodes}")
        print(f"Average reward per episode: {total_reward / total_episodes}")
        print(f"Percentage of won episodes: {score}%")
    return score
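# A hedged usage sketch for `evaluate`, assuming a tabular environment such as
# Taxi-v3, whose discrete observation space makes the `q_table[state, :]`
# indexing above valid. Taxi gives +20 for a successful drop-off, which is
# what `winning_reward` counts here. The zero-initialized table is a
# placeholder for one produced by Q-learning.
import gym

env = TimeLimit(gym.make("Taxi-v3").unwrapped, max_episode_steps=200)

# Random baseline:
evaluate(env, total_episodes=100, is_random=True, winning_reward=20, display_result=True)

# Untrained placeholder Q-table; a trained one would win far more episodes.
q_table = np.zeros((env.observation_space.n, env.action_space.n))
evaluate(env, total_episodes=100, q_table=q_table, winning_reward=20, display_result=True)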