def test_evaluate_actions_sizes() -> None:
    """ Test the sizes of returned tensors from ppo.evaluate_actions(). """

    settings = dict(DEFAULT_SETTINGS)
    env = get_env(settings["env_name"], settings["num_processes"])
    policy = get_policy(env, settings)
    minibatch_size = (
        settings["rollout_length"]
        * settings["num_processes"]
        // settings["num_minibatch"]
    )

    obs_list = [
        torch.Tensor(env.observation_space.sample()) for _ in range(minibatch_size)
    ]
    obs_batch = torch.stack(obs_list)
    actions_list = [
        torch.Tensor([float(env.action_space.sample())])
        for _ in range(minibatch_size)
    ]
    actions_batch = torch.stack(actions_list)

    value_pred, action_log_prob, action_dist_entropy, _ = policy.evaluate_actions(
        obs_batch, None, actions_batch, None
    )

    assert isinstance(value_pred, torch.Tensor)
    assert value_pred.shape == torch.Size([minibatch_size])
    assert isinstance(action_log_prob, torch.Tensor)
    assert action_log_prob.shape == torch.Size([minibatch_size])
    assert isinstance(action_dist_entropy, torch.Tensor)
    assert action_dist_entropy.shape == torch.Size([minibatch_size])
def test_get_value_sizes() -> None:
    """ Test the sizes of returned tensors from ppo.get_value(). """

    settings = dict(DEFAULT_SETTINGS)
    env = get_env(settings["env_name"], settings["num_processes"])
    policy = get_policy(env, settings)

    obs = torch.Tensor(env.observation_space.sample())
    value_pred = policy.get_value(obs, None, None)

    assert isinstance(value_pred, torch.Tensor)
    assert value_pred.shape == torch.Size([1])
def get_metaworld_rollout(settings: Dict[str, Any]) -> RolloutStorage:
    """
    Execute and return a single rollout over a MetaWorld environment using
    configuration in `settings`.
    """

    # Construct environment and policy.
    env = get_env(
        settings["env_name"],
        num_processes=settings["num_processes"],
        seed=settings["seed"],
        time_limit=settings["time_limit"],
        normalize_transition=settings["normalize_transition"],
        normalize_first_n=settings["normalize_first_n"],
        allow_early_resets=True,
        same_np_seed=settings["same_np_seed"],
        add_observability=settings["add_observability"],
        save_memory=settings["save_memory"],
    )
    policy = get_policy(env, settings)
    rollout = RolloutStorage(
        rollout_length=settings["rollout_length"],
        observation_space=env.observation_space,
        action_space=env.action_space,
        num_processes=settings["num_processes"],
        hidden_state_size=1,
        device=settings["device"],
    )
    rollout.set_initial_obs(env.reset())

    # Collect rollout.
    for rollout_step in range(rollout.rollout_length):

        # Sample actions.
        with torch.no_grad():
            values, actions, action_log_probs, hidden_states = policy.act(
                rollout.obs[rollout_step],
                rollout.hidden_states[rollout_step],
                rollout.dones[rollout_step],
            )

        # Perform step and record in `rollout`.
        obs, rewards, dones, infos = env.step(actions)
        rollout.add_step(
            obs, actions, dones, action_log_probs, values, rewards, hidden_states
        )

    env.close()

    return rollout
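# A minimal usage sketch of the helper above, assuming `DEFAULT_SETTINGS` holds the
# keys referenced inside `get_metaworld_rollout()` (illustrative only, not an
# additional test):
#
#     rollout = get_metaworld_rollout(dict(DEFAULT_SETTINGS))
#     assert rollout.obs[0].shape[0] == DEFAULT_SETTINGS["num_processes"]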
def test_collect_rollout_values() -> None:
    """
    Test the values of the returned RolloutStorage objects from
    train.collect_rollout().
    """

    settings = dict(DEFAULT_SETTINGS)
    settings["env_name"] = "unique-env"

    env = get_env(
        settings["env_name"],
        normalize_transition=settings["normalize_transition"],
        allow_early_resets=True,
    )
    policy = get_policy(env, settings)
    rollout = RolloutStorage(
        rollout_length=settings["rollout_length"],
        observation_space=env.observation_space,
        action_space=env.action_space,
        num_processes=settings["num_processes"],
        hidden_state_size=1,
        device=settings["device"],
    )
    rollout.set_initial_obs(env.reset())
    rollout, _, _ = collect_rollout(rollout, env, policy)

    # Check if rollout info came from UniqueEnv.
    for step in range(rollout.rollout_step):
        obs = rollout.obs[step]
        value_pred = rollout.value_preds[step]
        action = rollout.actions[step]
        action_log_prob = rollout.action_log_probs[step]
        reward = rollout.rewards[step]

        # Check shapes.
        assert obs.shape == torch.Size([settings["num_processes"], 1])
        assert value_pred.shape == torch.Size([settings["num_processes"], 1])
        assert action.shape == torch.Size([settings["num_processes"], 1])
        assert action_log_prob.shape == torch.Size([settings["num_processes"], 1])
        assert reward.shape == torch.Size([settings["num_processes"], 1])

        # Check consistency of values.
        assert float(obs) == float(step + 1)
        assert float(action) - int(action) == 0 and int(action) in env.action_space
        assert float(obs) == float(reward)

    env.close()
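# The value checks above assume "unique-env" behaves roughly like the minimal dummy
# environment sketched below: the observation counts up by one each step, and the
# reward equals the observation the action was taken from. This sketch is illustrative
# only and is not the repository's actual UniqueEnv implementation.
class _UniqueEnvSketch:
    """ Illustrative stand-in for a UniqueEnv-style dummy environment. """

    def reset(self) -> np.ndarray:
        self.timestep = 1.0
        return np.array([self.timestep])

    def step(self, action: float) -> Tuple[np.ndarray, float, bool, Dict[str, Any]]:
        reward = float(self.timestep)
        self.timestep += 1.0
        return np.array([self.timestep]), reward, False, {}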
def test_lr_schedule_cosine() -> None:
    """
    Tests the learning rate schedule in the case where the schedule type is cosine.
    """

    # Initialize environment and policy.
    settings = dict(DEFAULT_SETTINGS)
    settings["lr_schedule_type"] = "cosine"
    env = get_env(
        settings["env_name"], settings["num_processes"], allow_early_resets=True
    )
    policy = get_policy(env, settings)

    # Define helper function to check learning rate.
    def check_lr(optimizer: Optimizer, lr: float) -> None:
        for param_group in optimizer.param_groups:
            assert abs(param_group["lr"] - lr) < TOL

    # Run training and test values of learning rate along the way.
    check_lr(policy.optimizer, settings["initial_lr"])
    for i in range(settings["num_updates"]):

        # Perform update.
        rollout = get_rollout(
            env,
            policy,
            settings["num_episodes"],
            settings["episode_len"],
            settings["num_processes"],
            settings["device"],
        )
        for step_loss in policy.get_loss(rollout):
            step_loss.backward()
            policy.optimizer.step()
        policy.after_step()

        # Check learning rate.
        interval_pos = math.pi * float(i + 1) / settings["num_updates"]
        offset = (
            0.5
            * (settings["initial_lr"] - settings["final_lr"])
            * (1.0 + math.cos(interval_pos))
        )
        expected_lr = settings["final_lr"] + offset
        check_lr(policy.optimizer, expected_lr)
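# For reference, the expected learning rate checked above follows standard cosine
# annealing from `initial_lr` down to `final_lr` over `num_updates` updates. A minimal
# sketch of that schedule (assuming the policy's scheduler is configured this way):
def _cosine_annealed_lr(
    initial_lr: float, final_lr: float, update: int, num_updates: int
) -> float:
    """ Cosine-annealed learning rate after `update` updates. """
    interval_pos = math.pi * float(update) / num_updates
    return final_lr + 0.5 * (initial_lr - final_lr) * (1.0 + math.cos(interval_pos))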
def test_update_values() -> None:
    """
    Tests whether PPOPolicy.get_loss() calculates correct updates in the case of a
    linear actor/critic network and a dummy environment.
    """

    # Initialize environment and policy.
    settings = dict(DEFAULT_SETTINGS)
    env = get_env(
        settings["env_name"], settings["num_processes"], allow_early_resets=True
    )
    policy = get_policy(env, settings)

    # Initialize rollout storage.
    rollout = get_rollout(
        env,
        policy,
        settings["num_episodes"],
        settings["episode_len"],
        settings["num_processes"],
        settings["device"],
    )

    # Compute expected losses.
    expected_loss_items = get_losses(rollout, policy, settings)

    # Compute actual losses.
    actual_loss = 0
    for step_loss in policy.get_loss(rollout):
        actual_loss += step_loss.item()
        step_loss.backward()
        policy.optimizer.step()
    policy.after_step()

    # Compare expected vs. actual.
    diff = abs(float(actual_loss - expected_loss_items["total"]))
    print("loss diff: %.8f" % diff)
    assert diff < BIG_TOL
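# For reference, the per-step loss that `get_losses()` and `PPOPolicy.get_loss()` are
# expected to agree on is the standard PPO clipped surrogate objective (the exact
# coefficients come from `settings`; this is the textbook form, not necessarily the
# repository's exact implementation):
#
#     r_t = pi(a_t | s_t) / pi_old(a_t | s_t)
#     L_t = -min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t)
#           + c_v * (V(s_t) - R_t)^2
#           - c_e * H(pi(. | s_t))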
def test_lr_schedule_null() -> None:
    """
    Tests the learning rate schedule in the case where no schedule type is given
    (the learning rate should remain constant).
    """

    # Initialize environment and policy.
    settings = dict(DEFAULT_SETTINGS)
    env = get_env(
        settings["env_name"], settings["num_processes"], allow_early_resets=True
    )
    policy = get_policy(env, settings)

    # Define helper function to check learning rate.
    def check_lr(optimizer: Optimizer, lr: float) -> None:
        for param_group in optimizer.param_groups:
            assert param_group["lr"] == lr

    # Run training and test values of learning rate along the way.
    check_lr(policy.optimizer, settings["initial_lr"])
    for _ in range(settings["num_updates"]):

        # Perform update.
        rollout = get_rollout(
            env,
            policy,
            settings["num_episodes"],
            settings["episode_len"],
            settings["num_processes"],
            settings["device"],
        )
        for step_loss in policy.get_loss(rollout):
            step_loss.backward()
            policy.optimizer.step()
        policy.after_step()

        # Check learning rate.
        check_lr(policy.optimizer, settings["initial_lr"])
def test_multitask_losses() -> None:
    """
    Tests that PPOPolicy.get_loss() correctly computes task-specific losses when
    multi-task training.
    """

    # Initialize environment and policy. Note that we set `normalize_first_n` to 39,
    # since it is the size of the total observation minus the number of tasks. We also
    # set `normalize_advantages` to False, as this makes it possible to compute the
    # task-specific losses while only considering each task's own transitions. Changing
    # this setting to True will cause this test to fail.
    settings = dict(DEFAULT_SETTINGS)
    settings["env_name"] = "MT10"
    settings["num_tasks"] = get_num_tasks(settings["env_name"])
    settings["num_processes"] = 10
    settings["num_episodes"] = 1
    settings["episode_len"] = 100
    settings["normalize_advantages"] = False
    env = get_env(
        settings["env_name"],
        settings["num_processes"],
        allow_early_resets=True,
        normalize_first_n=39,
    )
    policy = get_policy(env, settings)

    # Initialize rollout and task-specific rollouts.
    rollout, task_rollouts = get_task_rollouts(
        env,
        policy,
        settings["num_tasks"],
        settings["num_episodes"],
        settings["episode_len"],
        settings["num_processes"],
        settings["device"],
    )

    # Compute expected task losses.
    expected_task_losses = []
    for task, task_rollout in enumerate(task_rollouts):
        if task_rollout is None:
            expected_task_losses.append(0)
        else:
            task_settings = dict(settings)
            task_settings["num_processes"] = task_rollout.num_processes
            expected_loss_items = get_losses(task_rollout, policy, task_settings)
            expected_task_losses.append(expected_loss_items["total"])

    # Compute actual losses.
    actual_task_losses = [0] * settings["num_tasks"]
    for step_loss in policy.get_loss(rollout):
        actual_task_losses = [
            actual_task_losses[i] + step_loss[i].item()
            for i in range(settings["num_tasks"])
        ]

        # Aggregate task losses to execute backward pass.
        step_loss = sum(step_loss)
        step_loss.backward()
        policy.optimizer.step()
    policy.after_step()

    # Compare expected vs. actual.
    for task in range(settings["num_tasks"]):
        diff = abs(actual_task_losses[task] - expected_task_losses[task])
        print(
            "loss diff: %.8f, %.8f, %.8f"
            % (actual_task_losses[task], expected_task_losses[task], diff)
        )
        assert diff < BIG_TOL
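# The `normalize_first_n=39` argument above reflects the assumption that each MT10
# observation is a 39-dimensional environment observation with a 10-dimensional
# one-hot task index appended, so only the first 39 entries should be normalized.
# A minimal sketch of that construction (illustrative, not the repository's code):
def _append_task_one_hot(obs: np.ndarray, task: int, num_tasks: int = 10) -> np.ndarray:
    """ Append a one-hot task indicator to a raw observation. """
    one_hot = np.zeros(num_tasks, dtype=obs.dtype)
    one_hot[task] = 1.0
    return np.concatenate([obs, one_hot])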