def play(sim: Driver, optimal_ctrl):
    """Render the trajectory for the user, replaying while they say yes.

    Args:
        sim (Driver): Simulation environment used for rendering.
        optimal_ctrl: Control sequence to install and replay.

    Returns:
        The control sequence that was played back, unchanged.
    """
    sim.set_ctrl(optimal_ctrl)
    while True:
        sim.watch(1)
        # Re-prompt until we get an unambiguous yes/no answer.
        answer = ""
        while answer not in ("y", "n"):
            answer = input("Again? [y/n]: ").lower()
        if answer == "n":
            break
    return optimal_ctrl
def collect(
    outdir: Path,
    n_rewards: int,
    test_reward_path: Optional[Path] = None,
    std: Optional[float] = None,
    mean_reward_path: Optional[Path] = None,
    normals_paths: Optional[List[Path]] = None,
    preferences_paths: Optional[List[Path]] = None,
    use_random: bool = False,
    use_plausible: bool = False,
    skip_human: bool = False,
    overwrite: bool = False,
) -> None:
    """Collects ground truth labels for the optimal trajectories of some reward functions.

    Progress is resumable: existing outputs in outdir are loaded and only the
    missing rewards/paths/labels are generated.

    Args:
        outdir (Path): Directory to write output to
        n_rewards (int): Number of rewards to generate or process
        test_reward_path (Optional[Path], optional): Path to numpy array of reward weights to
            test. Defaults to None.
        std (Optional[float], optional): Standard deviation of normal distribution to draw test
            reward weights from. Defaults to None.
        mean_reward_path (Optional[Path], optional): Path to numpy array specifying mean reward
            weights to sample around. Defaults to None.
        normals_paths (Optional[List[Path]], optional): Paths to preference normals used to
            estimate a mean reward. Defaults to None.
        preferences_paths (Optional[List[Path]], optional): Paths to preference labels matching
            normals_paths. Defaults to None.
        use_random (bool, optional): Sample unit-norm random rewards. Defaults to False.
        use_plausible (bool, optional): Sample random rewards with plausible signs.
            Defaults to False.
        skip_human (bool, optional): Stop before asking for human alignment labels.
            Defaults to False.
        overwrite (bool, optional): Overwrite output? Defaults to False.

    Raises:
        ValueError: Raised if neither test_reward_path or both std and mean_reward_path are
            specified. The test rewards need to come from somewhere.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    out_rewards = load(outdir, "test_rewards.npy", overwrite=overwrite)
    new_rewards_index = out_rewards.shape[0] if out_rewards is not None else 0
    num_new_rewards = n_rewards - new_rewards_index

    env = Driver()

    if num_new_rewards > 0:
        if test_reward_path is not None:
            # BUGFIX: slice end must be n_rewards, not num_new_rewards. The old
            # [new_rewards_index:num_new_rewards] truncated (or emptied) the slice
            # whenever we resumed with new_rewards_index > 0.
            rewards = np.load(test_reward_path)[new_rewards_index:n_rewards]
        elif mean_reward_path is not None and std is not None:
            mean_reward = np.load(mean_reward_path)
            rewards = default_rng().normal(
                loc=mean_reward, scale=std, size=(num_new_rewards, *mean_reward.shape)
            )
        elif normals_paths is not None and preferences_paths is not None and std is not None:
            # NOTE(joschnei): This turned out not to work, because the random baseline is
            # poisoning the well
            normals = None
            for normals_path, preferences_path in zip(normals_paths, preferences_paths):
                single_normals = np.load(normals_path)
                single_preferences = np.load(preferences_path)
                # Orient each normal by its preference label before pooling.
                single_normals = (single_normals.T * single_preferences).T
                normals = append(normals, single_normals, flat=True)
            # TODO(joschnei): These can all be loaded in from flags.pkl, but I'm too lazy for
            # that.
            mean_reward = make_mode_reward(
                query_type="strict",
                true_delta=1.1,
                w_sampler=Sampler(env.num_of_features),
                n_reward_samples=100,
            )
            assert np.all(np.isfinite(mean_reward))
            rewards = default_rng().normal(
                loc=mean_reward, scale=std, size=(num_new_rewards, *mean_reward.shape)
            )
            assert np.all(np.isfinite(rewards))
        elif use_random:
            rewards = default_rng().normal(
                loc=0, scale=1, size=(num_new_rewards, env.num_of_features)
            )
            # NOTE(review): this normalizes by the norm of the whole matrix, not
            # per-row — confirm that per-reward unit norm is not intended here.
            rewards = rewards / np.linalg.norm(rewards)
        elif use_plausible:
            # Generate uniform rewards with plausible weights i.e. ones with the right sign
            rewards = default_rng().normal(
                loc=0, scale=1, size=(num_new_rewards, env.num_of_features)
            )
            rewards = rewards / np.linalg.norm(rewards)
            # See models.py for reward feature details.
            rewards[:, 0] = np.abs(rewards[:, 0])
            rewards[:, 1] = -np.abs(rewards[:, 1])
            rewards[:, 2] = np.abs(rewards[:, 2])
            rewards[:, 3] = -np.abs(rewards[:, 3])
        else:
            raise ValueError(
                "You must either supply a path to the test rewards, or a mean reward and "
                "std from which to sample the test rewards."
            )
        out_rewards = append(out_rewards, rewards, flat=True)
    else:
        assert out_rewards is not None

    assert np.all(np.isfinite(out_rewards))
    # np.save accepts a Path directly; avoids leaking an open file handle.
    np.save(outdir / "test_rewards.npy", out_rewards)

    paths = load(outdir, "optimal_paths.npy", overwrite=overwrite)
    new_paths_index = paths.shape[0] if paths is not None else 0
    num_new_paths = n_rewards - new_paths_index
    if num_new_paths > 0:
        new_paths = np.array(
            Parallel(n_jobs=-2)(
                delayed(make_opt_traj)(reward) for reward in out_rewards[new_paths_index:]
            )
        )
        paths = append(paths, new_paths, flat=True)
    else:
        assert paths is not None
    np.save(outdir / "optimal_paths.npy", np.array(paths))

    gt_alignment = load(outdir, "alignment.npy", overwrite=overwrite)
    new_gt_index = gt_alignment.size if gt_alignment is not None else 0

    if skip_human:
        # Return instead of exit(): don't raise SystemExit from library code.
        return

    for path in paths[new_gt_index:]:
        env.set_ctrl(path)
        env.watch(1)
        alignment = input("Aligned (y/n):").lower()
        while alignment not in ["y", "n"]:
            alignment = input("Aligned (y/n):").lower()
        gt_alignment = append(gt_alignment, alignment == "y")
    np.save(outdir / "alignment.npy", gt_alignment)