def play(env_name: str, manual_control: bool, max_steps: int):
    """Run an interactive (or random-action) session in the given environment.

    :param env_name: registered gym environment id to instantiate
    :param manual_control: if True, drive the joints from a pybullet debug UI;
        otherwise sample random actions
    :param max_steps: episode step limit enforced through ``TimeLimit``
    """
    # Make environment
    env = TimeLimit(gym.make(env_name, render=True), max_steps)
    observation = env.reset()

    if manual_control:
        # Create user debug interface: one slider in [-1, 1] per joint,
        # labelled with the joint's name as reported by pybullet.
        import pybullet as p
        params = [
            p.addUserDebugParameter(
                p.getJointInfo(env.robot_id, j)[1].decode(), -1, 1, 0)
            for j in env.joint_list
        ]

    reward_sum = 0
    # BUGFIX: the loop below never breaks, so the original trailing
    # `env.close()` was unreachable.  Running the loop under try/finally
    # guarantees the environment is closed on Ctrl+C or any error.
    try:
        while True:
            if manual_control:
                # Read user input and simulate motor
                a = [p.readUserDebugParameter(param) for param in params]
            else:
                a = env.action_space.sample()

            observation, reward, done, _ = env.step(a)
            reward_sum += reward
            print("\nobservation", observation)
            print("reward", reward)
            print("total reward", reward_sum)
            print("done", done)

            # Reset when done
            if done:
                observation = env.reset()
                reward_sum = 0
    finally:
        env.close()
def test_basics():
    """CartPole wrapped with TimeLimit + EnvDataset + EpisodeLimit closes
    itself once the episode budget is spent, and then refuses all further use.
    """
    env = EpisodeLimit(
        EnvDataset(TimeLimit(gym.make("CartPole-v0"), max_episode_steps=10)),
        max_episodes=3,
    )
    env.seed(123)

    # Run exactly the allowed number of episodes to completion.
    for _episode in range(3):
        env.reset()
        step, done = 0, False
        while not done:
            print(f"step {step}")
            _obs, _reward, done, _info = env.step(env.action_space.sample())
            step += 1

    # The episode budget is exhausted, so the env must now be closed...
    assert env.is_closed()

    # ...and every further interaction must raise.
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.reset()
    with pytest.raises(gym.error.ClosedEnvironmentError):
        env.step(env.action_space.sample())
    with pytest.raises(gym.error.ClosedEnvironmentError):
        for _ in env:
            break
def test_random_task_on_each_episode():
    """A fresh task is sampled at every reset, but the task label stays fixed
    within a single episode."""
    from gym.wrappers import TimeLimit

    base: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    schedule = {
        step: {"level": level}
        for step, level in [(0, 0), (5, 1), (200, 2), (300, 3), (400, 4)]
    }
    env = MultiTaskEnvironment(
        TimeLimit(base, max_episode_steps=10),
        task_schedule=schedule,
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )

    # Ten resets should not all land on the same task label.
    labels = [env.reset()["task_labels"] for _ in range(10)]
    assert len(set(labels)) > 1

    # Episodes only last 10 steps. Tasks don't have anything to do with the task
    # schedule.
    first_label = env.reset()["task_labels"]
    for step in range(10):
        obs, _reward, done, _info = env.step(env.action_space.sample())
        assert obs["task_labels"] == first_label
        # TimeLimit ends the episode exactly on the 10th step.
        if step == 9:
            assert done
        else:
            assert not done
    env.close()
def test_noop_reset_env(self):
    # runable test
    noop_max = 20
    # Wrap a short (3-step) TimeLimit env with the noop-on-reset wrapper.
    wrapped = atari.NoopResetEnv(
        TimeLimit(gym.make(TEST_ENV_ID), 3), noop_max=noop_max)
    wrapped.reset()
    # Step until `done` (the tight time limit), at most 20 times.
    for _ in range(20):
        _obs, _rew, done, _info = wrapped.step(wrapped.action_space.sample())
        if done:
            break
def test_task_schedule_monsterkong():
    """The MultiTaskEnvironment task_schedule switches MonsterKong levels on a
    fixed step count (every 100 steps here), and the level then sticks at the
    last scheduled value once the schedule is exhausted."""
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: {"level": 0},
            100: {"level": 1},
            200: {"level": 2},
            300: {"level": 3},
            400: {"level": 4},
        },
        add_task_id_to_obs=True)
    obs = env.reset()
    # img, task_labels = obs
    # With add_task_id_to_obs=True the observation carries the task id at
    # index 1.
    assert obs[1] == 0
    assert env.get_level() == 0
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        # Task id and level advance by one every 100 steps, per the schedule.
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            # TimeLimit ends an episode every 10 steps; resetting does not
            # disturb the schedule's global step count.
            print(f"End of episode at step {i}")
            obs = env.reset()
    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
    env.close()
def test_time_limit_reset_info():
    """TimeLimit.reset honours the `return_info` flag of the wrapped env."""
    env = TimeLimit(gym.make("CartPole-v1"))
    space = env.observation_space

    # Default reset: observation only.
    first = env.reset()
    assert space.contains(first)
    del first

    # Explicit return_info=False behaves like the default.
    second = env.reset(return_info=False)
    assert space.contains(second)
    del second

    # return_info=True yields an (obs, info) pair with a dict info.
    third, info = env.reset(return_info=True)
    assert space.contains(third)
    assert isinstance(info, dict)
def test_max_and_skip_env(self):
    # runable test
    skip = 4
    # 20 inner steps with skip=4 => the time limit fires on outer step 4.
    wrapped = atari.MaxAndSkipEnv(
        TimeLimit(gym.make(TEST_ENV_ID), 20), skip=skip)
    wrapped.seed(1)
    ub_utils.set_seed(1)
    wrapped.reset()
    for i in range(20):
        _obs, _rew, done, _info = wrapped.step(wrapped.action_space.sample())
        if done:
            break
    self.assertEqual(4, i)
def play_one_session(
    env: TimeLimit,
    max_size: int,
    action_chooser: Callable[[TimeLimit, Any], Any],
    render: bool = False,
    custom_actions: Callable[[int, TimeLimit, Any, Any, Any, bool, Any],
                             None] = None,
    stop_when_done: bool = True,
) -> Tuple[float, List[Dict[str, Any]]]:
    """Roll out at most ``max_size`` steps in ``env``.

    :param env: environment to play in
    :param max_size: maximum number of steps; also the divisor of the score
    :param action_chooser: callback mapping (env, observation) -> action
        (the action must support ``.reshape``)
    :param render: if True, render before every step
    :param custom_actions: optional per-step callback receiving
        (step index, env, action, observation, reward, done, info)
    :param stop_when_done: stop early when the env reports done
    :returns: (total reward divided by ``max_size``, list of per-step dicts
        with the pre-step ``observation`` and chosen ``action``)
    """
    obs = env.reset()
    total_reward = 0
    history = []
    for step_idx in range(max_size):
        if render:
            env.render()
        action = action_chooser(env, obs)
        # Record the observation/action pair *before* stepping.
        history.append({"observation": obs, "action": action})
        obs, reward, done, info = env.step(action.reshape((-1,)))
        total_reward += reward
        if custom_actions is not None:
            custom_actions(step_idx, env, action, obs, reward, done, info)
        if stop_when_done and done:
            break
    return total_reward / max_size, history
def test_random_task_on_each_episode_and_only_one_task_in_schedule():
    """ BUG: When the goal is to have only one task, it instead keeps sampling
    a new task from the 'distribution', in the case of cartpole! """
    from gym.wrappers import TimeLimit

    base: MetaMonsterKongEnv = gym.make("CartPole-v1")
    env = MultiTaskEnvironment(
        TimeLimit(base, max_episode_steps=10),
        task_schedule={0: {"length": 0.1}},
        add_task_id_to_obs=True,
        new_random_task_on_reset=True,
    )

    observed_labels = []
    observed_lengths = []
    for _episode in range(10):
        obs = env.reset()
        observed_labels.append(obs[1])
        observed_lengths.append(env.length)
        done = False
        while not done:
            obs, _reward, done, _info = env.step(env.action_space.sample())
            observed_labels.append(obs[1])
            observed_lengths.append(env.length)

    # With a single-entry schedule, neither the task label nor the pole
    # length may ever vary.
    assert set(observed_labels) == {0}
    assert set(observed_lengths) == {0.1}
def play_with_car():
    """Drive MountainCar with a hand-crafted open-loop policy that rocks the
    car left and right to build momentum, reporting success or failure."""
    max_steps = 250
    # +1 so our own loop bound, not TimeLimit, decides the "time is up" case.
    env = TimeLimit(MountainCarEnv(), max_episode_steps=max_steps + 1)
    LEFT, RIGHT = 0, 2  # action codes (1 would be 'stop')

    print('Initial state: ', env.reset())
    for t in range(max_steps):
        # need to modify policy
        if t < 50 or 70 <= t < 120:
            chosen = LEFT
        else:
            chosen = RIGHT
        state, reward, done, _ = env.step(chosen)
        print('State {}, Reward {}, Step {}'.format(state, reward, t))
        env.render()
        if done:
            # Position above 0.47 means the car actually reached the flag.
            print('Well done!' if state[0] > 0.47 else 'Please, try again.')
            break
    else:
        print('Time is up. Please, try again.')
    env.close()
def test_monitor(n_episodes):
    """Monitor writes one video + one meta file per episode, plus a single
    stats file and a single manifest.  Episodes here end either via the short
    TimeLimit (done at step 5), a simulated `needs_reset` flag, or the global
    step cap `steps`."""
    steps = 15
    env = gym.make("CartPole-v1")
    # unwrap default TimeLimit and wrap with new one to simulate done=True
    # at step 5
    assert isinstance(env, TimeLimit)
    env = env.env  # unwrap
    env = TimeLimit(env, max_episode_steps=5)  # wrap
    tmpdir = tempfile.mkdtemp()
    try:
        # video_callable=always so every episode produces an .mp4.
        env = pfrl.wrappers.Monitor(
            env, directory=tmpdir, video_callable=lambda episode_id: True
        )
        episode_idx = 0
        episode_len = 0
        t = 0
        _ = env.reset()
        while True:
            _, _, done, info = env.step(env.action_space.sample())
            episode_len += 1
            t += 1
            if episode_idx == 1 and episode_len >= 3:
                info["needs_reset"] = True  # simulate ContinuingTimeLimit
            # Episode boundary: natural `done`, simulated needs_reset, or the
            # global step budget running out.
            if done or info.get("needs_reset", False) or t == steps:
                if episode_idx + 1 == n_episodes or t == steps:
                    break
                env.reset()
                episode_idx += 1
                episode_len = 0
        # `env.close()` is called when `env` is garbage-collected
        # (or explicitly deleted/closed).
        del env
        # check if videos & meta files were generated
        files = os.listdir(tmpdir)
        mp4s = [f for f in files if f.endswith(".mp4")]
        metas = [f for f in files if f.endswith(".meta.json")]
        stats = [f for f in files if f.endswith(".stats.json")]
        manifests = [f for f in files if f.endswith(".manifest.json")]
        assert len(mp4s) == n_episodes
        assert len(metas) == n_episodes
        assert len(stats) == 1
        assert len(manifests) == 1
    finally:
        shutil.rmtree(tmpdir)
def test_task_schedule_with_callables():
    """ Apply functions to the env at a given step. """
    env: MetaMonsterKongEnv = gym.make("MetaMonsterKong-v1")
    from gym.wrappers import TimeLimit
    env = TimeLimit(env, max_episode_steps=10)
    from operator import methodcaller
    # Schedule `set_level(n)` callables instead of attribute dicts: the
    # wrapper invokes each callable on the env when its step is reached.
    env = MultiTaskEnvironment(
        env,
        task_schedule={
            0: methodcaller("set_level", 0),
            100: methodcaller("set_level", 1),
            200: methodcaller("set_level", 2),
            300: methodcaller("set_level", 3),
            400: methodcaller("set_level", 4),
        },
        add_task_id_to_obs=True)
    obs = env.reset()
    # img, task_labels = obs
    assert obs[1] == 0
    assert env.get_level() == 0
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        # Level advances by one every 100 steps, mirroring the schedule.
        assert obs[1] == i // 100
        assert env.level == i // 100
        env.render()
        assert isinstance(done, bool)
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
    assert obs[1] == 4
    assert env.level == 4
    # level stays the same even after reaching that objective.
    for i in range(500):
        obs, reward, done, info = env.step(env.action_space.sample())
        assert obs[1] == 4
        assert env.level == 4
        env.render()
        if done:
            print(f"End of episode at step {i}")
            obs = env.reset()
    # BUGFIX: the env was never closed here, unlike the otherwise-identical
    # dict-based schedule test, leaking the renderer/game window.
    env.close()
def test_change_gravity_each_step(self):
    """NOTE(review): despite the name, this variant increases the *mass* of a
    body part a little on every step (via ``env.set_mass``); the print of
    ``env.gravity`` below looks copy-pasted from the gravity twin — confirm.
    """
    env: ModifiedMassEnv = self.Environment()
    max_episode_steps = 500
    n_episodes = 5
    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env: ModifiedMassEnv
    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0
        # assumes index 1 of the observation is a height coordinate — TODO confirm
        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state
        body_part = self.body_names[0]
        start_mass = env.get_mass(body_part)
        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1
            # Linearly ramp the mass: +5 mass units over one episode's worth
            # of steps, keyed on the cumulative step count.
            env.set_mass(body_part=body_part,
                         mass=start_mass + 5 * total_steps / max_episode_steps)
            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {obs[1] > state[1]}")
        print(f"Gravity at end of episode: {env.gravity}")
        # TODO: Check that the position (in the observation) is obeying gravity?
        # if env.gravity <= 0:
        #     # Downward force, so should not have any significant preference for
        #     # moving up vs moving down.
        #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
        #     # if env.gravity == 0:
        #     #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
        # if env.gravity > 0:
        #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
    # Every episode is expected to run to the full TimeLimit here.
    assert total_steps == n_episodes * max_episode_steps
    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    assert initial_z == 0
    # Check that the robot is high up in the sky! :D
    assert final_z > 20
def test_change_gravity_each_step(self):
    """Ramp gravity from -10 toward -5 over an episode via ``set_gravity``
    and check the final height is above the initial one when gravity ended up
    positive."""
    env: ModifiedGravityEnv = self.Environment()
    max_episode_steps = 50
    n_episodes = 3
    # NOTE: Interestingly, the renderer will show
    # `env.frame_skip * max_episode_steps` frames per episode, even when
    # "Ren[d]er every frame" is set to False.
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    total_steps = 0
    for episode in range(n_episodes):
        initial_state = env.reset()
        done = False
        episode_steps = 0
        # assumes index 1 of the observation is a height coordinate — TODO confirm
        start_y = initial_state[1]
        moved_up = 0
        previous_state = initial_state
        state = initial_state
        while not done:
            previous_state = state
            state, reward, done, info = env.step(env.action_space.sample())
            env.render("human")
            episode_steps += 1
            total_steps += 1
            # decrease the gravity continually over time.
            # By the end, things should be floating.
            env.set_gravity(-10 + 5 * total_steps / max_episode_steps)
            moved_up += (state[1] > previous_state[1])
            # print(f"Moving upward? {obs[1] > state[1]}")
        # Episodes may also terminate early (e.g. a fall), hence the check.
        if episode_steps != max_episode_steps:
            print(f"Episode ended early?")
        print(f"Gravity at end of episode: {env.gravity}")
        # TODO: Check that the position (in the observation) is obeying gravity?
        # if env.gravity <= 0:
        #     # Downward force, so should not have any significant preference for
        #     # moving up vs moving down.
        #     assert 0.4 <= (moved_up / max_episode_steps) <= 0.6, env.gravity
        #     # if env.gravity == 0:
        #     #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0
        # if env.gravity > 0:
        #     assert 0.5 <= (moved_up / max_episode_steps) <= 1.0, env.gravity
    # <= rather than == because episodes may end before the TimeLimit.
    assert total_steps <= n_episodes * max_episode_steps
    initial_z = env.init_qpos[1]
    final_z = env.sim.data.qpos[1]
    if env.gravity > 0:
        assert final_z > initial_z
def test(pkl_path, pth_path, env, attempts, display=False, video_dir=None):
    """Replay a trained agent for ``attempts`` episodes and return the rewards.

    :param pkl_path: pickle file with the training log (agent type + params)
    :param pth_path: saved model weights to load into the agent
    :param env: environment to evaluate in (wrapped as the log prescribes)
    :param attempts: number of evaluation episodes
    :param display: if True, render each step with an info title
    :param video_dir: if given, record videos there via gym's Monitor
    :returns: list with the summed reward of each episode
    :raises ValueError: if the log names an unknown agent type
    """
    with open(pkl_path, 'rb') as f:
        logs = pickle.load(f)
    if logs['params']['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=logs['params']['max_episode_steps'])
    if video_dir:
        if not os.path.exists(video_dir):
            os.makedirs(video_dir)
        env = Monitor(env, video_dir, force=True)
    if logs['agent'] == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **logs['params'])
        agent.epsilon = 0  # evaluate greedily: no exploration
    elif logs['agent'] == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **logs['params'])
    elif logs['agent'] == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **logs['params'])
    elif logs['agent'] == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **logs['params'])
    else:
        # BUGFIX: an unknown agent type used to fall through and crash later
        # with an unhelpful NameError on the unbound `agent`.
        raise ValueError(f"Unknown agent type: {logs['agent']!r}")
    agent.load(pth_path)
    try:
        rewards = []
        for attempt in range(attempts):
            state = env.reset()
            sum_reward = 0
            t = 0
            done = False
            while not done:
                action = agent.get_action(state)
                next_state, reward, done, _ = env.step(action)
                state = next_state
                sum_reward += reward
                t += 1
                if display:
                    title = f'Attempt: {attempt+1} | Timestep: {t} | Reward: {reward} | Sum Reward: {sum_reward}'
                    render(env, title)
            rewards.append(sum_reward)
        env.close()
        return rewards
    except Exception:
        # Deliberate debugging aid: dump the traceback and drop into the
        # debugger before cleaning up.
        traceback.print_exc()
        breakpoint()
        env.close()
def main():
    """Roll out a single 10-step-capped episode of the configured MDP,
    printing state renders, per-step rewards, and a final summary."""
    env = TimeLimit(make_mdp(args.mdp, episodic=True), 10)
    env.reset()
    t = 0
    while True:
        print('---')
        print(f't: {t}')
        print('state:')
        env.render()
        _, reward, done, _ = env.step(policy(env, log=True))
        print(f'reward: {reward}')
        if done:
            print('final state:')
            env.render()
            print(f'Episode finished after {t+1} timesteps')
            break
        t += 1
    env.close()
def run_episodes(neps, seed):
    """Run ``neps`` grasp-then-move episodes on the TriFinger cube task with a
    force-control policy, visualizing through pybullet.

    :param neps: number of episodes to run
    :param seed: also used as the env rank for `make_training_env`
    """
    reward_fn = 'task1_reward'
    termination_fn = 'pos_and_rot_close_to_goal'
    # termination_fn = 'position_close_to_goal'
    initializer = 'task4_init'
    env = make_training_env(reward_fn, termination_fn, initializer,
                            action_space='torque_and_position',
                            init_joint_conf=True,
                            visualization=True,
                            grasp='pinch',
                            rank=seed)
    env = env.env  # HACK to remove FLatObservationWrapper
    # tmp_dir = '/tmp/video'
    # env = Monitor(RenderWrapper(TimeLimit(env, 1000)), tmp_dir,
    #               video_callable=lambda episode_id: True, mode='evaluation',
    #               force=True)
    env = TimeLimit(env, 1000)
    viz = Viz()
    for _ in range(neps):
        obs = env.reset()
        # Hide the pybullet GUI panels and frame the camera on the arena.
        p.configureDebugVisualizer(p.COV_ENABLE_GUI, 0)
        p.resetDebugVisualizerCamera(cameraDistance=0.6,
                                     cameraYaw=0,
                                     cameraPitch=-40,
                                     cameraTargetPosition=[0, 0, 0])
        viz.reset(obs)
        # tip_pd = TipPD([10, 1], 0.7 * env.cube_tip_positions)
        tip_pd = None
        controller = ForceControlPolicy(env, True, tip_pd)
        # obs = grasp_force_control(env, obs, controller.get_grasp_torque)
        obs = grasp_tippos_control(env, obs)
        # Then move toward the goal positions
        # Switch the underlying env to pure torque control for the move phase.
        env.unwrapped.action_space = TriFingerPlatform.spaces.robot_torque.gym
        env.unwrapped.action_type = cube_env.ActionType.TORQUE
        done = False
        while not done:
            # transform wrenches to base frame
            torque = controller(obs)
            obs, reward, done, info = env.step(torque)
            viz.update_cube_orientation(obs)
            # Slow the loop down so the visualization is watchable.
            time.sleep(0.01)
    env.close()
def main():
    """Roll out the configured CMDP, printing Q-value estimates under the
    observational ('none'), conditional and interventional inference modes at
    every step, and acting greedily w.r.t. the interventional values."""
    env = TimeLimit(make_cmdp(args.cmdp, episodic=True), 10)
    agent_model = agent_models.get_agent_model(args.cmdp.split('/')[-1])

    row_labels = 'E[G]', 'E[G | A=a]', 'E[G | do(A=a)]'
    col_labels = env.model.actions

    def q_values(mode):
        # One scalar Q estimate per action under the given inference mode.
        return [
            infer_Q(env, action, mode, agent_model=agent_model).item()
            for action in range(env.action_space.n)
        ]

    _, state = env.reset()
    for t in itt.count():
        print()
        print(f't: {t}')
        env.render()

        rows = [q_values('none'), q_values('condition'),
                q_values('intervention')]
        print(pd.DataFrame(rows, row_labels, col_labels))

        # Greedy choice under the interventional estimates.
        action = torch.tensor(rows[2]).argmax()
        state, _, done, _ = env.step(action)
        if done:
            print()
            print(f'final state: {state}')
            print(f'Episode finished after {t+1} timesteps')
            break
    env.close()
class Renderer:
    """Keyboard-driven episode renderer for a curses-style window.

    Subclasses implement `get_action` (mapping a keypress to an env action)
    and may override `reset` for per-episode bookkeeping.
    """

    def __init__(self, args):
        # Cap episode length via TimeLimit.
        self.env = TimeLimit(gym.make(args.env),
                             max_episode_steps=args.max_steps)

    def get_action(self, obs, ch):
        """Map (observation, pressed key) to an action; subclass hook."""
        raise NotImplementedError

    def reset(self, init_obs):
        """Per-episode reset hook; no-op by default."""
        pass

    def main_loop(self, window):
        """Run one episode: draw status, read a key, step, repeat until done."""
        obs = self.env.reset()
        self.reset(obs)
        action, reward = None, None
        steps, ret = 0, 0
        done = False
        while not done:
            self.display(action, done, ret, reward, steps, window)
            keypress = window.getch()
            action = self.get_action(obs, keypress)
            obs, reward, done, _ = self.env.step(action)
            ret += reward
            steps += 1
        # Clear screen
        self.display(action, done, ret, reward, steps, window)
        window.getch()

    def display(self, action, done, ret, reward, steps, window):
        """Render the current episode status through `show`."""
        status = {
            'steps': steps,
            'action': (gym_psketch.ID2ACTIONS[action]
                       if action is not None else action),
            'reward': reward,
            'return': ret,
            'done': done,
        }
        show(self.env, window, status)
class BaseTestRotMAB:
    """Base test class for RotMAB environment."""

    def __init__(self, winning_probs, max_steps):
        """Initialize test class."""
        self.winning_probs = winning_probs
        self.max_steps = max_steps
        self.env = TimeLimit(
            NonMarkovianRotatingMAB(winning_probs=self.winning_probs),
            max_episode_steps=self.max_steps,
        )

    def test_action_space(self):
        """Test action spaces."""
        # One arm per winning probability.
        assert self.env.action_space == Discrete(len(self.winning_probs))

    def test_observation_space(self):
        """Test observation spaces."""
        # Observations are binary: win (1) or lose (0).
        assert self.env.observation_space == Discrete(2)

    def test_interaction(self):
        """Test interaction with Rotating MAB."""
        self.env.seed()
        assert self.env.reset() == 0

        def assert_consistency(obs, reward):
            """Assert obs = 1 iff reward = 1."""
            won_reward = reward > 0.0
            won_obs = obs == 1
            assert (won_reward and won_obs
                    or (not won_reward and not won_obs))

        # All but the last step must not terminate the episode.
        for _ in range(self.max_steps - 1):
            sampled = self.env.action_space.sample()
            obs, reward, done, info = self.env.step(sampled)
            assert_consistency(obs, reward)
            assert not done

        # last action: the TimeLimit must fire here.
        obs, reward, done, info = self.env.step(0)
        assert_consistency(obs, reward)
        assert done
def replay_memory(env: TimeLimit, memory: List[List[Any]]):
    """Re-play recorded episodes: reset the env before each stored episode,
    then replay its actions one by one, rendering after every step."""
    for episode_actions in memory:
        env.reset()
        for recorded_action in episode_actions:
            env.step(recorded_action)
            env.render()
plt.plot(line, label=label) plt.legend() plt.show() return np.array(s_s).T #s_s = play(agt, play_env) #labels = ['x', 'v_x', 'cos(theta)', 'sin(theta)', 'thetadot'] #for label, line in zip(labels, s_s): # plt.plot(line, label=label) #plt.legend() #plt.show() s = env.reset() for step in range(100000): a = int(q.get_action(s)) sp, r, done, _ = env.step(a) agt.handle_transition(s, a, r, sp, done) s_s.append(s.detach().numpy()) s = sp if done: s = env.reset() done = False if (step % 1000) == 0:
def get_cswm_data(env_name, seed, num_episodes=1000):
    """Collect a random-agent replay buffer for C-SWM-style training on Atari.

    Each episode contributes stacked (current, previous) cropped frames as
    `obs`/`next_obs`, the random `action` taken, and the AtariARI RAM `label`.

    :param env_name: Atari env id (crop window / warmstart tuned for Pong and
        Space Invaders; anything else falls back to the Pong settings)
    :param seed: seed for numpy and the env's action space / env itself
    :param num_episodes: number of episodes to record
    :returns: list of per-episode dicts with 'obs', 'action', 'next_obs',
        'label' lists
    """
    logger.set_level(logger.INFO)
    env = gym.make(env_name)
    np.random.seed(seed)
    env.action_space.seed(seed)
    env.seed(seed)
    agent = RandomAgent(env.action_space)
    episode_count = num_episodes
    reward = 0
    done = False
    crop = None
    warmstart = None
    # Per-game crop window (rows kept) and number of burn-in steps.
    if env_name == 'PongDeterministic-v4':
        crop = (35, 190)
        warmstart = 58
    elif env_name == 'SpaceInvadersDeterministic-v4':
        crop = (30, 200)
        warmstart = 50
    else:
        # Fallback: reuse the Pong settings for unknown games.
        crop = (35, 190)
        warmstart = 58
    # Keep only ~11 post-warmstart steps per episode.
    max_episode_steps = warmstart + 11
    env = TimeLimit(env, max_episode_steps=max_episode_steps)
    env = AtariARIWrapper(env)
    replay_buffer = []
    for i in range(episode_count):
        replay_buffer.append({
            'obs': [],
            'action': [],
            'next_obs': [],
            'label': []
        })
        ob = env.reset()
        # Burn-in steps
        for _ in range(warmstart):
            action = agent.act(ob, reward, done)
            ob, _, _, _ = env.step(action)
        prev_ob = crop_normalize(ob, crop)
        # One no-op step (action 0) to obtain the first `info["labels"]`.
        ob, _, _, info = env.step(0)
        ob = crop_normalize(ob, crop)
        while True:
            # Stack current + previous frame along the channel axis.
            replay_buffer[i]['obs'].append(
                np.concatenate((ob, prev_ob), axis=0))
            prev_ob = ob
            replay_buffer[i]["label"].append(info["labels"])
            action = agent.act(ob, reward, done)
            ob, reward, done, info = env.step(action)
            ob = crop_normalize(ob, crop)
            replay_buffer[i]['action'].append(action)
            replay_buffer[i]['next_obs'].append(
                np.concatenate((ob, prev_ob), axis=0))
            if done:
                break
        if i % 10 == 0:
            print("iter " + str(i))
    return replay_buffer
# MODIFIED: Initializing learning rate anneal scheduler when need if args.anneal_lr: anneal_fn = lambda f: max(0, 1 - f / args.total_timesteps) pg_lr_scheduler = optim.lr_scheduler.LambdaLR(pg_optimizer, lr_lambda=anneal_fn) vf_lr_scheduler = optim.lr_scheduler.LambdaLR(v_optimizer, lr_lambda=anneal_fn) loss_fn = nn.MSELoss() # TRY NOT TO MODIFY: start the game global_step = 0 while global_step < args.total_timesteps: if args.capture_video: env.stats_recorder.done = True next_obs = np.array(env.reset()) # ALGO Logic: Storage for epoch data obs = np.empty((args.batch_size, ) + env.observation_space.shape) actions = np.empty((args.batch_size, ) + env.action_space.shape) logprobs = torch.zeros(( env.action_space.nvec.shape[0], args.batch_size, )).to(device) rewards = np.zeros((args.batch_size, )) raw_rewards = np.zeros(( len(env.rfs), args.batch_size, ))
def train(agent_type, env, verbose=True, save_freq=50, save_dir='./', **params):
    """Train an RL agent on ``env`` until ``params['max_steps']`` env steps.

    :param agent_type: one of 'dqn', 'a2c', 'td3', 'random'
    :param env: environment to train in (wrapped in TimeLimit if requested)
    :param verbose: if True, print params and per-episode info
    :param save_freq: save weights + log every this many episodes
    :param save_dir: directory for checkpoints; must end with '/'
    :param params: agent hyper-parameters; must contain 'max_episode_steps',
        'max_steps', 'start_at' and 'file_name'
    :raises ValueError: for an unknown ``agent_type``
    :raises NotADirectoryError: if ``save_dir`` does not end with '/'
    """
    if verbose:
        print(params)
    if agent_type == 'dqn':
        agent = DQNAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'a2c':
        agent = A2CAgent(env.observation_space, env.action_space, **params)
    elif agent_type == 'td3':
        agent = TD3Agent(env.observation_space, env.action_space, **params)
    elif agent_type == 'random':
        agent = RandomAgent(env.observation_space, env.action_space, **params)
    else:
        # Guard against silently leaving `agent` unbound.
        raise ValueError(f"Unknown agent type: {agent_type!r}")
    if params['max_episode_steps'] is not None:
        env = TimeLimit(env, max_episode_steps=params['max_episode_steps'])
    log = {'agent': agent_type, 'params': params, 'episodes': []}
    if save_dir[-1] != '/':
        # BUGFIX: the original `raise NotADirectory` referenced a name that
        # does not exist (the builtin is NotADirectoryError) and would have
        # raised a NameError instead of the intended exception.
        raise NotADirectoryError(save_dir)
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    try:
        ep = 0
        t_total = 0
        while t_total < params['max_steps']:
            state = env.reset()
            sum_reward = 0
            t_ep = 0
            done = False
            while not done:
                # Random warm-up until `start_at` steps, then the agent policy.
                if t_total > params['start_at']:
                    action = agent.get_action(state)
                else:
                    action = env.action_space.sample()
                next_state, reward, done, _ = env.step(action)
                agent.remember(state, action, reward, next_state, done)
                state = next_state
                sum_reward += reward
                t_ep += 1
                # for agents using online training (learn every step)
                if agent.online and t_total > params['start_at']:
                    agent.learn()
            # for agents using offline training (learn once per episode)
            if not agent.online and t_total > params['start_at']:
                agent.learn()
            ep += 1
            t_total += t_ep
            ep_info = {'episode': ep, 't_ep': t_ep, 't_total': t_total,
                       'sum_reward': sum_reward,
                       'optim_steps': agent.optim_steps,
                       'memory': len(agent.memory)}
            log['episodes'].append(ep_info)
            if verbose:
                print(ep_info)
            if ep % save_freq == 0:
                agent.save(save_dir + params['file_name'] + '.pth')
                with open(save_dir + params['file_name'] + '.pkl', 'wb') as f:
                    pickle.dump(log, f)
                if verbose:
                    print('Episode ' + str(ep) + ': Saved model weights and log.')
        env.close()
    except Exception:
        # Deliberate debugging aid: dump the traceback, then drop into pdb.
        traceback.print_exc()
        breakpoint()
return step max_action = float(env.action_space.high[0]) rb = ReplayBuffer(args.buffer_size) actor = Actor(env).to(device) qf1 = QNetwork(env).to(device) qf1_target = QNetwork(env).to(device) target_actor = Actor(env).to(device) target_actor.load_state_dict(actor.state_dict()) qf1_target.load_state_dict(qf1.state_dict()) q_optimizer = optim.Adam(list(qf1.parameters()), lr=args.learning_rate) actor_optimizer = optim.Adam(list(actor.parameters()), lr=args.learning_rate) loss_fn = nn.MSELoss() # TRY NOT TO MODIFY: start the game obs = env.reset() episode_reward = 0 for global_step in range(args.total_timesteps): # ALGO LOGIC: put action logic here if global_step < args.learning_starts: action = env.action_space.sample() else: action = actor.forward(obs.reshape((1, ) + obs.shape), device) if args.use_levy: action = (action.tolist()[0] + sampleFromLevy( args.levy_mu, args.levy_scale, env.action_space)).clip( env.action_space.low, env.action_space.high) else:
def evaluate(
    env: TimeLimit,
    total_episodes: int,
    *,
    q_table: np.ndarray = None,
    winning_reward: float = None,
    is_random: bool = False,
    render: bool = False,
    display_result: bool = False,
) -> float:
    """
    Evaluate the performance of a q-table to solve a gym environment problem

    It may also use random instead of a q-table in order to compare the
    performance of a q-table against a random solution

    :param env: gym environment to solve
    :param total_episodes: number of time to repeat the evaluation. The bigger
        the more statistically significant the output will be
    :param q_table: Q-table to used solve the problem if given, is_random must
        be False
    :param winning_reward: the reward given to the agent when it solves the
        problem. It is used to compute the number of time the agent solved the
        problem
    :param is_random: if True will use random instead of Q-table. If True,
        q-table must not be given
    :param render: if True will call env.render()
    :param display_result: If True, prints evaluation summary in the console
        at the evaluation end
    :return: percentage (0-100) of episodes whose final reward equalled
        ``winning_reward``
    :raises RuntimeError: if both or neither of q_table / is_random are given
    """
    # Todo : rename and re-think is_random parameter into policy parameter
    # Todo : render only last evaluation
    # Todo : yield q-table, evaluate it and continue evaluation if it is not good enough
    if (q_table is not None) and is_random:
        raise RuntimeError("is_random and q_table given")
    elif q_table is None and not is_random:
        # BUGFIX: the old check compared `is_random is None`, which can never
        # be true given the declared default (False); a call providing neither
        # argument slipped through and crashed later on `q_table[state, :]`.
        raise RuntimeError(
            "at least one of q_table and is_random must be given")
    total_epochs, total_reward, total_won_episodes = 0, 0, 0
    for _ in range(total_episodes):
        state = env.reset()
        if render:
            env.render()
        done = False
        while not done:
            if is_random:
                action = env.action_space.sample()
            else:
                # Greedy policy with respect to the Q-table.
                action = np.argmax(q_table[state, :])
            state, reward, done, info = env.step(action)
            total_epochs += 1
            total_reward += reward
            if render:
                env.render()
        # `reward` here is the final reward of the episode; a win is detected
        # by it matching `winning_reward` exactly.
        # noinspection PyUnboundLocalVariable
        if reward == winning_reward:
            total_won_episodes += 1
    score = round(total_won_episodes * 100 / total_episodes, 2)
    if display_result:
        print("-" * 30)
        print(
            f"Results after {total_episodes} episodes using {'random' if is_random else 'q_table'}:"
        )
        print(f"Average steps per episode: {total_epochs / total_episodes}")
        print(f"Average reward per episode: {total_reward / total_episodes}")
        print(f"Percentage of won episodes : {score}%")
    return score