def legacy_make_test_rewards(
    n_questions: int,
    n_rewards: int,
    true_reward: np.ndarray,
    epsilons: List[float],
    use_equiv: bool,
) -> Dict[float, Tuple[np.ndarray, np.ndarray]]:
    """Generates n_rewards reward vectors and determines which are aligned."""
    assert n_rewards > 0
    assert_reward(true_reward, use_equiv)

    trajs = make_random_questions(n_questions, Driver())
    _, normals = make_normals(trajs, Driver(), use_equiv)
    gt_pref = true_reward @ normals.T > 0
    normals = orient_normals(normals, gt_pref, use_equiv)
    assert_normals(normals, use_equiv)
    n_reward_features = normals.shape[1]

    test_rewards: Dict[float, Tuple[np.ndarray, np.ndarray]] = {}
    for epsilon in epsilons:
        assert epsilon >= 0.0

        cov = 1.0

        rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
        # Keep only questions whose value gap under the true reward exceeds epsilon.
        # Filter from the full question set each time so epsilons don't compound.
        epsilon_normals = normals[true_reward @ normals.T > epsilon]
        ground_truth_alignment = cast(np.ndarray, np.all(rewards @ epsilon_normals.T > 0, axis=1))
        mean_agree = np.mean(ground_truth_alignment)

        # Tune the sampling covariance until roughly half of the rewards are aligned.
        while mean_agree > 0.55 or mean_agree < 0.45:
            if mean_agree > 0.55:
                cov *= 1.1
            else:
                cov /= 1.1
            if not np.isfinite(cov) or cov <= 0.0 or cov >= 100.0:
                # TODO(joschnei): Break is a code smell
                logging.warning(f"cov={cov}, using last good batch of rewards.")
                break
            rewards = make_gaussian_rewards(n_rewards, use_equiv, mean=true_reward, cov=cov)
            ground_truth_alignment = cast(
                np.ndarray, np.all(rewards @ epsilon_normals.T > 0, axis=1)
            )
            mean_agree = np.mean(ground_truth_alignment)

        assert ground_truth_alignment.shape == (n_rewards,)
        assert rewards.shape == (n_rewards, n_reward_features)

        test_rewards[epsilon] = (rewards, ground_truth_alignment)

    return test_rewards
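# Minimal usage sketch (illustrative only, not part of the original module). Downstream
# code such as `simulated` consumes the returned dict as epsilon -> (rewards, alignment
# labels); the `_demo_*` name below is hypothetical.
def _demo_legacy_test_rewards(true_reward: np.ndarray) -> None:
    test_rewards = legacy_make_test_rewards(
        n_questions=100, n_rewards=10, true_reward=true_reward, epsilons=[0.0], use_equiv=False
    )
    rewards, aligned = test_rewards[0.0]
    assert rewards.shape[0] == aligned.shape[0] == 10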
def main() -> None:
    reward_weights = np.ones(4)
    sim = Driver()
    env = LegacyEnv(reward_weights)

    plans = make_actions(1000)

    returns = []
    start = perf_counter()
    for plan in plans:
        sim.feed(plan)
        features = sim.get_features()
        returns.append(reward_weights @ features)
    stop = perf_counter()
    print(f"Driver env took {(stop - start) / len(plans)} seconds on average")

    # Driver env is a lot faster for rollouts
    returns = []
    start = perf_counter()
    for plan in plans:
        env.reset()
        plan_return = 0.0
        for action in plan:
            _, reward, _, _ = env.step(action)
            plan_return += reward
        returns.append(plan_return)
    stop = perf_counter()
    print(f"LegacyEnv took {(stop - start) / len(plans)} seconds on average")
def play(sim: Driver, optimal_ctrl):
    """Renders trajectory for user."""
    sim.set_ctrl(optimal_ctrl)
    keep_playing = "y"
    while keep_playing == "y":
        keep_playing = "u"
        sim.watch(1)
        while keep_playing != "n" and keep_playing != "y":
            keep_playing = input("Again? [y/n]: ").lower()
    return optimal_ctrl
def test_get_simulated_feedback(actions: np.ndarray, reward: np.ndarray):
    feature_1, feature_2, pref = get_simulated_feedback(
        Driver(), actions[0], actions[1], "strict", reward
    )
    expected_pref = (reward @ (feature_1 - feature_2) > 0) * 2 - 1
    assert expected_pref == pref
def test_orient_normals(actions: np.ndarray, reward: np.ndarray):
    reward = safe_normalize(reward)

    _, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    value_diffs = reward @ normals.T
    prefs = value_diffs > 0

    oriented_normals = orient_normals(normals, preferences=prefs)
    assert_normals(oriented_normals)
    assert np.all(reward @ oriented_normals.T == np.abs(value_diffs))
def make_normals(inputs: np.ndarray, sim: Driver, use_equiv: bool) -> Tuple[np.ndarray, np.ndarray]:
    """Converts pairs of car inputs to trajectory preference normal vectors.

    Args:
        inputs (np.ndarray): (n, 2, T, 2) array of pairs of 2-dimensional actions for T timesteps
        sim (Driver): Driving simulation to get features from
        use_equiv (bool): Allow equivalent preferences?

    Returns:
        Tuple[np.ndarray, np.ndarray]: input features and normal vectors
    """
    if len(inputs.shape) == 3:
        shape_compat(inputs, (-1, 2, -1))
    elif len(inputs.shape) == 4:
        shape_compat(inputs, (-1, 2, -1, 2))

    normals = np.empty(shape=(inputs.shape[0], sim.num_of_features))
    input_features = np.empty(shape=(inputs.shape[0], 2, sim.num_of_features))
    for i, (input_a, input_b) in enumerate(inputs):
        sim.feed(input_a)
        phi_a = np.array(sim.get_features())

        sim.feed(input_b)
        phi_b = np.array(sim.get_features())

        input_features[i] = np.stack((phi_a, phi_b))
        normals[i] = phi_a - phi_b

    assert_normals(normals, use_equiv)
    return input_features, normals
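# Minimal usage sketch (illustrative only, not part of the original module). Mirrors how
# make_gt_test_align uses this function: sample random question pairs, turn them into
# preference normals, then orient them by the true reward's preferences. The `_demo_*`
# name is hypothetical.
def _demo_make_normals(true_reward: np.ndarray, n_questions: int = 10) -> np.ndarray:
    env = Driver()
    questions = make_random_questions(n_questions, env)  # (n, 2, T, 2) action pairs
    _, normals = make_normals(questions, env, False)
    prefs = true_reward @ normals.T > 0
    return orient_normals(normals, prefs, False)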
def get_simulated_feedback(
    simulation: Driver,
    input_A: np.ndarray,
    input_B: np.ndarray,
    query_type: str,
    true_reward: np.ndarray,
    delta: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Gets the preference between two trajectories from an agent simulated by true_reward."""
    simulation.feed(input_A)
    phi_A = np.array(simulation.get_features())
    simulation.feed(input_B)
    phi_B = np.array(simulation.get_features())
    if query_type == "weak":
        if delta is None:
            raise ValueError("Must provide delta when using weak queries.")
        # TODO(joschnei): Implement weak errors using delta. I think there's a model for
        # this, but I can't remember it off hand.
        raise NotImplementedError("Simulated weak preferences not implemented.")
    elif query_type == "strict":
        s = 1 if true_reward @ (phi_A - phi_B) > 0 else -1
    else:
        raise ValueError(f'query type {query_type} must be either "strict" or "weak"')
    return phi_A, phi_B, s
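# Usage sketch (illustrative only, not part of the original module). For strict queries
# the returned label is the sign of true_reward @ (phi_A - phi_B), i.e. +1 when the
# simulated agent prefers trajectory A; test_get_simulated_feedback checks exactly this
# identity. The `_demo_*` name is hypothetical.
def _demo_simulated_feedback(true_reward: np.ndarray, actions: np.ndarray) -> int:
    phi_A, phi_B, pref = get_simulated_feedback(
        Driver(), actions[0], actions[1], "strict", true_reward
    )
    assert pref == (1 if true_reward @ (phi_A - phi_B) > 0 else -1)
    return pref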
def main(datadir: Path) -> None:
    logging.basicConfig(level="INFO")

    datadir = Path(datadir)

    flags = pickle.load(open(datadir / "flags.pkl", "rb"))
    use_equiv = False

    sim = Driver()
    n_reward_features = sim.num_of_features

    inputs = np.load(datadir / "inputs.npy")
    n_questions = inputs.shape[0]
    assert inputs.shape[1] == 2

    input_features = np.load(datadir / "input_features.npy")
    n_questions = input_features.shape[0]
    assert input_features.shape == (n_questions, 2, n_reward_features), input_features.shape
    assert_input_feature_consistency(inputs, input_features, sim)

    normals = np.load(datadir / "normals.npy")
    logging.info(f"There are {normals.shape[0]} questions")
    assert_normals(normals, use_equiv, n_reward_features)
    assert_normal_consistency(input_features, normals)

    preferences = np.load(datadir / "preferences.npy")
    assert preferences.shape == (n_questions,)
    assert np.all((preferences == 1) | (preferences == -1))

    oriented_normals = orient_normals(normals, preferences)

    if (datadir / "true_reward.npy").exists():
        true_reward = np.load(datadir / "true_reward.npy")
        assert_reward(true_reward, use_equiv, n_reward_features)
        logging.info(f"true_reward={true_reward}")
        assert_true_reward_consistency(oriented_normals, true_reward)

    if (datadir / "mean_reward.npy").exists():
        mean_reward = np.load(datadir / "mean_reward.npy")
        logging.info(f"mean_reward={mean_reward}")
        assert_reward(mean_reward, use_equiv, n_reward_features)

        mean_accuracy = np.mean(oriented_normals @ mean_reward > 0)
        logging.info(f"Accuracy of mean reward function is {mean_accuracy}")
def make_gt_test_align(
    test_rewards: np.ndarray,
    n_questions: int,
    true_reward: np.ndarray,
    epsilon: float,
    use_equiv: bool = False,
) -> np.ndarray:
    """Labels each test reward as aligned iff it agrees with the true reward on every
    question whose value difference exceeds epsilon."""
    env = Driver()
    trajs = make_random_questions(n_questions, env)
    _, normals = make_normals(trajs, env, use_equiv)

    value_diff = true_reward @ normals.T
    eps_questions = np.abs(value_diff) > epsilon
    normals = normals[eps_questions]

    gt_pref = value_diff[eps_questions] > 0
    normals = orient_normals(normals, gt_pref, use_equiv)

    alignment = cast(np.ndarray, np.all(test_rewards @ normals.T > 0, axis=1))
    assert alignment.shape == (
        test_rewards.shape[0],
    ), f"alignment shape={alignment.shape} is not expected {test_rewards.shape[0]}"
    return alignment
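# Usage sketch (illustrative only, not part of the original module). A batch of candidate
# rewards sampled around the true reward, as in legacy_make_test_rewards, is labeled
# against ground-truth questions. The `_demo_*` name is hypothetical.
def _demo_gt_test_align(true_reward: np.ndarray) -> np.ndarray:
    candidates = make_gaussian_rewards(100, False, mean=true_reward, cov=1.0)
    return make_gt_test_align(
        test_rewards=candidates, n_questions=1000, true_reward=true_reward, epsilon=0.0
    )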
def human(
    epsilons: List[float] = [0.0],
    deltas: List[float] = [0.05],
    n_rewards: int = 10000,
    human_samples: List[int] = [1],
    n_model_samples: int = 1000,
    input_features_name: Path = Path("input_features.npy"),
    normals_name: Path = Path("normals.npy"),
    preferences_name: Path = Path("preferences.npy"),
    flags_name: Path = Path("flags.pkl"),
    datadir: Path = Path("questions"),
    outdir: Path = Path("questions"),
    rewards_path: Optional[Path] = None,
    use_mean_reward: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    n_cpus: int = 1,
    overwrite: bool = False,
):
    """Evaluates alignment test elicited from a human."""
    outdir.mkdir(parents=True, exist_ok=True)

    parallel = Parallel(n_jobs=n_cpus)

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    sim = Driver()
    n_reward_features = sim.num_of_features

    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    assert elicited_preferences.shape[0] > 0

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_model_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
    )

    test_path = outdir / make_outname(
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
        base="indices",
    )
    test_results_path = outdir / make_outname(
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
        base="test_results",
    )

    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path, overwrite)
    results: Dict[Experiment, np.ndarray] = load(test_results_path, overwrite)

    test_rewards = (
        np.load(open(rewards_path, "rb"))
        if rewards_path is not None
        else make_gaussian_rewards(n_rewards, use_equiv)
    )
    np.save(outdir / "test_rewards.npy", test_rewards)

    experiments = make_experiments(
        epsilons, deltas, human_samples, overwrite, experiments=set(minimal_tests.keys())
    )

    for indices, result, experiment in parallel(
        delayed(run_human_experiment)(
            test_rewards,
            elicited_normals,
            elicited_input_features,
            elicited_preferences,
            epsilon,
            delta,
            n,
            factory,
            use_equiv,
        )
        for epsilon, delta, n in experiments
    ):
        minimal_tests[experiment] = indices
        results[experiment] = result

    pkl.dump(minimal_tests, open(test_path, "wb"))
    pkl.dump(results, open(test_results_path, "wb"))
def simulated(
    epsilons: List[float] = [0.0],
    n_rewards: int = 100,
    human_samples: List[int] = [1],
    n_reward_samples: int = 1000,
    n_test_states: Optional[int] = None,
    n_gt_test_questions: int = 10000,
    traj_opt: bool = False,
    datadir: Path = Path(),
    outdir: Path = Path(),
    deltas: List[Optional[float]] = [None],
    use_mean_reward: bool = False,
    use_random_test_questions: bool = False,
    n_random_test_questions: Optional[int] = None,
    use_cheating_questions: bool = False,
    skip_remove_duplicates: bool = False,
    skip_epsilon_filtering: bool = False,
    skip_redundancy_filtering: bool = False,
    use_true_epsilon: bool = False,
    legacy_test_rewards: bool = False,
    replications: Optional[Union[str, Tuple[int, ...]]] = None,
    n_cpus: int = 1,
    overwrite_test_rewards: bool = False,
    overwrite_results: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    """Evaluates alignment tests generated from ground-truth rewards."""
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")

    if replications is not None:
        replication_indices = parse_replications(replications)

        for replication in replication_indices:
            if not (datadir / str(replication)).exists():
                logging.warning(f"Replication {replication} does not exist, skipping")
                continue

            logging.info(f"Starting replication {replication}")

            simulated(
                epsilons=epsilons,
                deltas=deltas,
                n_rewards=n_rewards,
                human_samples=human_samples,
                n_reward_samples=n_reward_samples,
                n_test_states=n_test_states,
                n_gt_test_questions=n_gt_test_questions,
                datadir=datadir / str(replication),
                outdir=outdir / str(replication),
                use_mean_reward=use_mean_reward,
                use_random_test_questions=use_random_test_questions,
                use_cheating_questions=use_cheating_questions,
                n_random_test_questions=n_random_test_questions,
                skip_remove_duplicates=skip_remove_duplicates,
                skip_epsilon_filtering=skip_epsilon_filtering,
                skip_redundancy_filtering=skip_redundancy_filtering,
                use_true_epsilon=use_true_epsilon,
                legacy_test_rewards=legacy_test_rewards,
                n_cpus=n_cpus,
                overwrite_test_rewards=overwrite_test_rewards,
                overwrite_results=overwrite_results,
                verbosity=verbosity,
            )
        exit()

    logging.info(f"Using {n_cpus} cpus.")
    parallel = Parallel(n_jobs=n_cpus)

    outdir.mkdir(parents=True, exist_ok=True)

    if n_random_test_questions is not None:
        # argh defaults to parsing an optional argument as a string, so coerce explicitly.
        n_random_test_questions = int(n_random_test_questions)

    flags = pkl.load(open(datadir / flags_name, "rb"))
    query_type = flags["query_type"]
    equiv_probability = flags["equiv_size"]

    env = Driver()
    n_reward_features = env.num_of_features

    logging.info("Loading elicitation results")

    elicited_normals, elicited_preferences, elicited_input_features = load_elicitation(
        datadir=datadir,
        normals_name=normals_name,
        preferences_name=preferences_name,
        input_features_name=input_features_name,
        n_reward_features=n_reward_features,
        use_equiv=use_equiv,
        query_type=query_type,
        equiv_probability=equiv_probability,
    )
    true_reward = np.load(datadir / true_reward_name)
    assert_reward(true_reward, False, n_reward_features)

    if use_equiv:
        true_reward = np.append(true_reward, [1])
    else:
        assert not np.any(elicited_preferences == 0)

    factory = TestFactory(
        query_type=query_type,
        reward_dimension=elicited_normals.shape[1],
        equiv_probability=equiv_probability,
        n_reward_samples=n_reward_samples,
        use_mean_reward=use_mean_reward,
        skip_dedup=skip_remove_duplicates,
        skip_noise_filtering=True,
        skip_epsilon_filtering=skip_epsilon_filtering,
        skip_redundancy_filtering=skip_redundancy_filtering,
        use_true_epsilon=use_true_epsilon,
        true_reward=true_reward,
    )
    logging.info(
        f"""Filtering settings:
    # reward samples={n_reward_samples},
    use mean reward={use_mean_reward},
    skip duplicates={skip_remove_duplicates}
    skip noise={True}
    skip epsilon={skip_epsilon_filtering}
    skip redundancy={skip_redundancy_filtering}
    use true epsilon={use_true_epsilon}
    """
    )

    confusion_path, test_path = make_outnames(
        outdir,
        skip_remove_duplicates,
        True,
        skip_epsilon_filtering,
        skip_redundancy_filtering,
    )
    confusions: Dict[Experiment, np.ndarray] = load(confusion_path, overwrite_results, default={})
    minimal_tests: Dict[Experiment, np.ndarray] = load(test_path, overwrite_results, default={})

    experiments = make_experiments(
        epsilons, deltas, human_samples, overwrite_results, experiments=set(minimal_tests.keys())
    )

    if use_random_test_questions:
        logging.info("Making random test")
        logging.info(f"True reward: {true_reward}")
        normals, preferences, input_features = make_random_test(
            n_random_test_questions,
            elicited_input_features,
            elicited_preferences,
            reward_iterations=flags["reward_iterations"],
            query_type=query_type,
            equiv_size=flags["equiv_size"],
            sim=env,
            use_equiv=use_equiv,
        )

        good_indices = (true_reward @ normals.T) > 0

        logging.info(
            f"{np.mean(good_indices) * 100:.2f}% of new test questions agree with gt reward."
        )

        if use_cheating_questions:
            logging.info("Selecting only questions consistent with gt reward")
            normals = normals[good_indices]
            preferences = preferences[good_indices]
            input_features = input_features[good_indices]

        assert_normals(normals, use_equiv)
    else:
        max_n = max(human_samples)
        preferences = elicited_preferences[:max_n]
        input_features = elicited_input_features[:max_n]
        logging.debug(f"elicited_normals={elicited_normals[:10]}")
        normals = orient_normals(
            elicited_normals[:max_n], preferences, use_equiv, n_reward_features
        )
        logging.debug(f"normals={normals[:10]}")

        assert np.all(true_reward @ normals.T >= 0)

    if not legacy_test_rewards:
        test_rewards = make_test_rewards(
            epsilons=epsilons,
            true_reward=true_reward,
            n_rewards=n_rewards,
            n_test_states=n_test_states,
            n_gt_test_questions=int(n_gt_test_questions),
            traj_opt=traj_opt,
            outdir=outdir,
            parallel=parallel,
            use_equiv=use_equiv,
            overwrite=overwrite_test_rewards,
        )
    else:
        test_rewards = legacy_make_test_rewards(1000, n_rewards, true_reward, epsilons, use_equiv)

    for indices, confusion, experiment in parallel(
        delayed(run_gt_experiment)(
            normals=normals,
            test_rewards=test_rewards[epsilon][0],
            test_reward_alignment=test_rewards[epsilon][1],
            epsilon=epsilon,
            delta=delta,
            use_equiv=use_equiv,
            n_human_samples=n,
            factory=factory,
            input_features=input_features,
            preferences=preferences,
            outdir=outdir,
            verbosity=verbosity,
        )
        for epsilon, delta, n in experiments
    ):
        minimal_tests[experiment] = indices
        confusions[experiment] = confusion

    pkl.dump(confusions, open(confusion_path, "wb"))
    pkl.dump(minimal_tests, open(test_path, "wb"))
def simulated(
    outdir: Path,
    criterion: Literal["information", "volume", "random"],
    termination_threshold: float,
    n_reward_samples: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: Optional[float] = None,
    true_reward_path: Optional[Path] = None,
    continuous: bool = False,
    overwrite: bool = False,
    replications: Optional[str] = None,
):
    """Generates a test by eliciting from a human simulated by a ground truth reward."""
    if replications is not None:
        replication_indices = parse_replications(replications)

        if true_reward_path is not None:
            reward_dir, reward_name = make_reward_path(true_reward_path)
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    true_reward_path=reward_dir / str(i) / reward_name,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        else:
            Parallel(n_jobs=-2)(
                delayed(simulated)(
                    outdir=Path(outdir) / str(i),
                    criterion=criterion,
                    termination_threshold=termination_threshold,
                    n_reward_samples=n_reward_samples,
                    query_type=query_type,
                    equiv_size=equiv_size,
                    continuous=continuous,
                    overwrite=overwrite,
                )
                for i in replication_indices
            )
        exit()

    criterion, query_type, outdir = setup(criterion, query_type, outdir, delta=equiv_size)

    env = Driver()
    d = env.num_of_features

    if true_reward_path is not None:
        logging.info(f"Loading true reward from {true_reward_path}")
        true_reward = np.load(true_reward_path)
    else:
        logging.info("Randomly generating true reward")
        true_reward = np.random.normal(size=(4,))
        true_reward = true_reward / np.linalg.norm(true_reward)
        np.save(outdir / "true_reward.npy", true_reward)

    pickle.dump(
        {
            "criterion": criterion,
            "reward_iterations": n_reward_samples,
            "stop_thresh": termination_threshold,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "continuous": continuous,
        },
        open(outdir / "flags.pkl", "wb"),
    )

    normals = load(outdir / "normals.npy", overwrite=overwrite)
    preferences = load(outdir / "preferences.npy", overwrite=overwrite)
    inputs = load(outdir / "inputs.npy", overwrite=overwrite)
    input_features = load(outdir / "input_features.npy", overwrite=overwrite)

    # If there is already data, feed it to the w_sampler to get the right posterior.
    w_sampler = Sampler(d)
    if inputs is not None and input_features is not None and preferences is not None:
        for (a_phi, b_phi), preference in zip(input_features, preferences):
            w_sampler.feed(a_phi, b_phi, [preference])

    score = np.inf
    try:
        while score >= termination_threshold:
            w_samples, delta_samples = w_sampler.sample_given_delta(
                sample_count=n_reward_samples, query_type=query_type, delta=equiv_size
            )

            input_A, input_B, score = run_algo(criterion, env, w_samples, delta_samples, continuous)

            logging.info(f"Score={score}")

            if score > termination_threshold:
                inputs = update_inputs(
                    a_inputs=input_A, b_inputs=input_B, inputs=inputs, outdir=outdir
                )
                phi_A, phi_B, preference = get_simulated_feedback(
                    simulation=env,
                    input_A=input_A,
                    input_B=input_B,
                    query_type=query_type,
                    true_reward=true_reward,
                    delta=equiv_size,
                )
                input_features = append(input_features, np.stack([phi_A, phi_B]))
                normals = append(normals, phi_A - phi_B)
                preferences = append(preferences, preference)

                np.save(outdir / "input_features.npy", input_features)
                np.save(outdir / "normals.npy", normals)
                np.save(outdir / "preferences.npy", preferences)

                w_sampler.feed(phi_A, phi_B, [preference])
    except KeyboardInterrupt:
        # Pass through to finally
        logging.warning("\nSaving results, please do not exit again.")
    finally:
        save_reward(query_type, w_sampler, n_reward_samples, outdir, true_delta=equiv_size)
def human(
    criterion: str,
    query_type: str,
    epsilon: float,
    n_reward_samples: int,
    equiv_size: float,
    outdir: Path = Path("questions"),
    continuous: bool = False,
    overwrite: bool = False,
):
    """Generates a test by eliciting preferences from a human."""
    criterion, query_type, outdir = setup(criterion, query_type, outdir, delta=equiv_size)

    simulation_object = Driver()
    d = simulation_object.num_of_features

    pickle.dump(
        {
            "criterion": criterion,
            "query_type": query_type,
            "epsilon": epsilon,
            "reward_iterations": n_reward_samples,
            "delta": equiv_size,
            "continuous": continuous,
        },
        open(outdir / "flags.pkl", "wb"),
    )

    normals = load(outdir / "normals.npy", overwrite=overwrite)
    preferences = load(outdir / "preferences.npy", overwrite=overwrite)
    inputs = load(outdir / "inputs.npy", overwrite=overwrite)
    input_features = load(outdir / "input_features.npy", overwrite=overwrite)

    w_sampler = Sampler(d)
    if inputs is not None and input_features is not None and preferences is not None:
        for (a_phi, b_phi), preference in zip(input_features, preferences):
            w_sampler.feed(a_phi, b_phi, [preference])

    score = np.inf
    try:
        while score >= epsilon:
            w_samples, delta_samples = w_sampler.sample_given_delta(
                n_reward_samples, query_type, equiv_size
            )
            input_A, input_B, score = run_algo(
                criterion, simulation_object, w_samples, delta_samples, continuous
            )
            if score > epsilon:
                inputs = update_inputs(
                    a_inputs=input_A, b_inputs=input_B, inputs=inputs, outdir=outdir
                )
                phi_A, phi_B, preference = get_feedback(
                    simulation_object, input_A, input_B, query_type
                )

                input_features = append(input_features, np.stack([phi_A, phi_B]))
                normals = append(normals, phi_A - phi_B)
                preferences = append(preferences, preference)

                np.save(outdir / "input_features.npy", input_features)
                np.save(outdir / "normals.npy", normals)
                np.save(outdir / "preferences.npy", preferences)

                w_sampler.feed(phi_A, phi_B, [preference])
    except KeyboardInterrupt:
        # Pass through to finally
        logging.warning("\nSaving results, please do not exit again.")
    finally:
        save_reward(query_type, w_sampler, n_reward_samples, outdir, true_delta=equiv_size)
def test_make_normals(actions: np.ndarray):
    features, normals = make_normals(inputs=actions, sim=Driver(), use_equiv=False)
    assert np.all((features[0][0] - features[0][1]) == normals)
    assert_normals(normals)
def main(
    n_questions: int,
    query_type: Literal["strict", "weak"] = "strict",
    equiv_size: float = 1.1,
    reward_iterations: int = 100,
    outdir: Path = Path("data/simulated/random/elicitation"),
    human: bool = False,
    reward_path: Optional[Path] = None,
    replications: Optional[str] = None,
    overwrite: bool = False,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
) -> None:
    outpath = Path(outdir)
    outpath.mkdir(parents=True, exist_ok=True)
    setup_logging(verbosity=verbosity, log_path=outpath / "log.txt")

    if not human:
        assert reward_path is not None
        reward_dir, reward_name = make_reward_path(reward_path)
        reward_path = reward_dir / reward_name

    if replications is not None:
        replication_indices = parse_replications(replications)
        n_cpus = min(multiprocessing.cpu_count() - 4, len(replication_indices))
        Parallel(n_jobs=n_cpus)(
            delayed(main)(
                n_questions=n_questions,
                query_type=query_type,
                equiv_size=equiv_size,
                reward_iterations=reward_iterations,
                outdir=outpath / str(i),
                human=human,
                reward_path=reward_dir / str(i) / reward_name,
                overwrite=overwrite,
                verbosity=verbosity,
            )
            for i in replication_indices
        )
        exit()

    if not human:
        assert reward_path is not None
        if not reward_path.exists():
            logging.warning("Reward path given does not exist, generating random reward.")
            true_reward = np.random.default_rng().normal(loc=0, scale=1, size=(4,))
            true_reward = safe_normalize(true_reward)
            np.save(reward_path, true_reward)
        else:
            true_reward = np.load(reward_path)

    pickle.dump(
        {
            "n_questions": n_questions,
            "query_type": query_type,
            "equiv_size": equiv_size,
            "reward_iterations": reward_iterations,
            "human": human,
        },
        open(outpath / "flags.pkl", "wb"),
    )

    normals = load(outpath / "normals.npy", overwrite=overwrite)
    preferences = load(outpath / "preferences.npy", overwrite=overwrite)
    # TODO(joschnei): Make class for inputs, dimensions are too difficult to reason about
    # (N, 2, 100)
    inputs = load(outpath / "inputs.npy", overwrite=overwrite)
    input_features = load(outpath / "input_features.npy", overwrite=overwrite)

    env = Driver()

    if (
        inputs is not None
        and input_features is not None
        and inputs.shape[0] > input_features.shape[0]
    ):
        logging.info("Catching up to previously generated trajectories.")
        input_A, input_B = inputs[-1]
        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )
        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    # Questions and inputs are duplicated, but this keeps everything consistent for the
    # hot-load case.
    new_questions = (n_questions - inputs.shape[0]) if inputs is not None else n_questions
    questions = make_random_questions(n_questions=new_questions, env=env)
    logging.debug(f"questions={questions[:10]}")

    if inputs is not None:
        assert input_features is not None
        assert normals is not None
        assert preferences is not None
        assert inputs.shape[0] == input_features.shape[0]
        assert inputs.shape[0] == normals.shape[0]
        assert inputs.shape[0] == preferences.shape[0]

    for input_A, input_B in questions:
        inputs = update_inputs(input_A, input_B, inputs, outpath)
        if inputs.shape[0] % 10 == 0:
            logging.info(f"{inputs.shape[0]} of {n_questions}")

        if human:
            phi_A, phi_B, preference = get_feedback(env, input_A, input_B, query_type)
        else:
            phi_A, phi_B, preference = get_simulated_feedback(
                env, input_A, input_B, query_type, true_reward, equiv_size
            )
        input_features, normals, preferences = update_response(
            input_features, normals, preferences, phi_A, phi_B, preference, outpath
        )

    save_reward(
        query_type=query_type,
        true_delta=equiv_size,
        w_sampler=Sampler(env.num_of_features),
        n_reward_samples=reward_iterations,
        outdir=outpath,
    )
def collect(
    outdir: Path,
    n_rewards: int,
    test_reward_path: Optional[Path] = None,
    std: Optional[float] = None,
    mean_reward_path: Optional[Path] = None,
    normals_paths: Optional[List[Path]] = None,
    preferences_paths: Optional[List[Path]] = None,
    use_random: bool = False,
    use_plausible: bool = False,
    skip_human: bool = False,
    overwrite: bool = False,
) -> None:
    """Collects ground truth labels for the optimal trajectories of some reward functions.

    Args:
        outdir (Path): Directory to write output to
        n_rewards (int): Number of rewards to generate or process
        test_reward_path (Optional[Path], optional): Path to numpy array of reward weights to
            test. Defaults to None.
        std (Optional[float], optional): Standard deviation of the normal distribution to draw
            test reward weights from. Defaults to None.
        mean_reward_path (Optional[Path], optional): Path to numpy array specifying mean reward
            weights to sample around. Defaults to None.
        overwrite (bool, optional): Overwrite output? Defaults to False.

    Raises:
        ValueError: Raised if neither test_reward_path nor both std and mean_reward_path are
            specified. The test rewards need to come from somewhere.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)

    out_rewards = load(outdir, "test_rewards.npy", overwrite=overwrite)
    new_rewards_index = out_rewards.shape[0] if out_rewards is not None else 0
    num_new_rewards = n_rewards - new_rewards_index

    env = Driver()

    if num_new_rewards > 0:
        if test_reward_path is not None:
            # Only the rewards not already processed in a previous run are loaded.
            rewards = np.load(test_reward_path)[new_rewards_index:n_rewards]
        elif mean_reward_path is not None and std is not None:
            mean_reward = np.load(mean_reward_path)
            rewards = default_rng().normal(
                loc=mean_reward, scale=std, size=(num_new_rewards, *mean_reward.shape)
            )
        elif normals_paths is not None and preferences_paths is not None and std is not None:
            # NOTE(joschnei): This turned out not to work, because the random baseline is
            # poisoning the well.
            normals = None
            for normals_path, preferences_path in zip(normals_paths, preferences_paths):
                single_normals = np.load(normals_path)
                single_preferences = np.load(preferences_path)
                single_normals = (single_normals.T * single_preferences).T
                normals = append(normals, single_normals, flat=True)

            # TODO(joschnei): These can all be loaded in from flags.pkl, but I'm too lazy for that.
            mean_reward = make_mode_reward(
                query_type="strict",
                true_delta=1.1,
                w_sampler=Sampler(env.num_of_features),
                n_reward_samples=100,
            )
            assert np.all(np.isfinite(mean_reward))
            rewards = default_rng().normal(
                loc=mean_reward, scale=std, size=(num_new_rewards, *mean_reward.shape)
            )
            assert np.all(np.isfinite(rewards))
        elif use_random:
            rewards = default_rng().normal(
                loc=0, scale=1, size=(num_new_rewards, env.num_of_features)
            )
            rewards = rewards / np.linalg.norm(rewards)
        elif use_plausible:
            # Generate uniform rewards with plausible weights, i.e. ones with the right sign.
            rewards = default_rng().normal(
                loc=0, scale=1, size=(num_new_rewards, env.num_of_features)
            )
            rewards = rewards / np.linalg.norm(rewards)
            # See models.py for reward feature details.
            rewards[:, 0] = np.abs(rewards[:, 0])
            rewards[:, 1] = -np.abs(rewards[:, 1])
            rewards[:, 2] = np.abs(rewards[:, 2])
            rewards[:, 3] = -np.abs(rewards[:, 3])
        else:
            raise ValueError(
                "You must either supply a path to the test rewards, or a mean reward and "
                "std from which to sample the test rewards."
            )
        out_rewards = append(out_rewards, rewards, flat=True)
    else:
        assert out_rewards is not None
    assert np.all(np.isfinite(out_rewards))

    np.save(open(outdir / "test_rewards.npy", "wb"), out_rewards)

    paths = load(outdir, "optimal_paths.npy", overwrite=overwrite)
    new_paths_index = paths.shape[0] if paths is not None else 0
    num_new_paths = n_rewards - new_paths_index

    if num_new_paths > 0:
        new_paths = np.array(
            Parallel(n_jobs=-2)(
                delayed(make_opt_traj)(reward) for reward in out_rewards[new_paths_index:]
            )
        )
        paths = append(paths, new_paths, flat=True)
    else:
        assert paths is not None

    np.save(open(outdir / "optimal_paths.npy", "wb"), np.array(paths))

    gt_alignment = load(outdir, "alignment.npy", overwrite=overwrite)
    new_gt_index = gt_alignment.size if gt_alignment is not None else 0

    if skip_human:
        exit()

    for path in paths[new_gt_index:]:
        env.set_ctrl(path)
        env.watch(1)

        alignment = input("Aligned (y/n):").lower()
        while alignment not in ["y", "n"]:
            alignment = input("Aligned (y/n):").lower()

        gt_alignment = append(gt_alignment, alignment == "y")
        np.save(open(outdir / "alignment.npy", "wb"), gt_alignment)