from time import perf_counter
from typing import Optional, Tuple

import numpy as np

# Driver, LegacyEnv, make_actions, shape_compat, and assert_normals are
# assumed to be imported from elsewhere in this repo.


def get_simulated_feedback(
    simulation: Driver,
    input_A: np.ndarray,
    input_B: np.ndarray,
    query_type: str,
    true_reward: np.ndarray,
    delta: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """Gets a preference between two trajectories from an agent simulated by true_reward."""
    simulation.feed(input_A)
    phi_A = np.array(simulation.get_features())
    simulation.feed(input_B)
    phi_B = np.array(simulation.get_features())
    if query_type == "weak":
        # Validate delta before bailing out, so callers get the right error.
        if delta is None:
            raise ValueError("Must provide delta when using weak queries.")
        # TODO(joschnei): Implement weak errors using delta. I think there's a
        # model for this, but I can't remember it offhand.
        raise NotImplementedError("Simulated weak preferences not implemented.")
    elif query_type == "strict":
        # The simulated agent strictly prefers whichever trajectory scores
        # higher under the true reward: s = 1 means input_A, s = -1 means input_B.
        s = 1 if true_reward @ (phi_A - phi_B) > 0 else -1
    else:
        raise ValueError(f'query type {query_type} must be either "strict" or "weak"')
    return phi_A, phi_B, s
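
# Usage sketch for get_simulated_feedback. The (T, 2) action shapes and the
# 4-dimensional reward below are illustrative assumptions, not values taken
# from this repo:
#
#     sim = Driver()
#     plan_a = np.zeros((50, 2))  # 50 timesteps of 2-d actions
#     plan_b = np.ones((50, 2))
#     phi_a, phi_b, pref = get_simulated_feedback(
#         sim, plan_a, plan_b, query_type="strict", true_reward=np.ones(4)
#     )
#     # pref == 1 exactly when the simulated agent prefers plan_a's trajectory.
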
def make_normals(
    inputs: np.ndarray, sim: Driver, use_equiv: bool
) -> Tuple[np.ndarray, np.ndarray]:
    """Converts pairs of car inputs to trajectory preference normal vectors.

    Args:
        inputs (np.ndarray): (n, 2, T, 2) array of pairs of 2-dimensional actions for T timesteps
        sim (Driver): Driving simulation to get features from
        use_equiv (bool): Allow equivalent preferences?

    Returns:
        Tuple[np.ndarray, np.ndarray]: input features and normal vectors
    """
    if len(inputs.shape) == 3:
        shape_compat(inputs, (-1, 2, -1))
    elif len(inputs.shape) == 4:
        shape_compat(inputs, (-1, 2, -1, 2))

    normals = np.empty(shape=(inputs.shape[0], sim.num_of_features))
    input_features = np.empty(shape=(inputs.shape[0], 2, sim.num_of_features))
    for i, (input_a, input_b) in enumerate(inputs):
        sim.feed(input_a)
        phi_a = np.array(sim.get_features())
        sim.feed(input_b)
        phi_b = np.array(sim.get_features())
        input_features[i] = np.stack((phi_a, phi_b))
        # The feature difference is the normal of the halfspace of reward
        # weights that prefer trajectory a over trajectory b.
        normals[i] = phi_a - phi_b
    assert_normals(normals, use_equiv)
    return input_features, normals
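
# Usage sketch for make_normals, following the (n, 2, T, 2) layout documented
# above. The concrete sizes are illustrative assumptions:
#
#     sim = Driver()
#     inputs = np.random.uniform(-1, 1, size=(10, 2, 50, 2))  # 10 query pairs
#     input_features, normals = make_normals(inputs, sim, use_equiv=False)
#     # normals[i] @ w > 0 exactly when reward weights w prefer the first
#     # trajectory of pair i, so each normal is a halfspace constraint on w.
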
def main() -> None:
    reward_weights = np.ones(4)
    sim = Driver()
    env = LegacyEnv(reward_weights)
    plans = make_actions(1000)

    # Time whole-plan rollouts in the Driver simulation.
    returns = []
    start = perf_counter()
    for plan in plans:
        sim.feed(plan)
        features = sim.get_features()
        returns.append(reward_weights @ features)
    stop = perf_counter()
    print(f"Driver env took {(stop - start) / len(plans)} seconds on average")

    # Time per-step rollouts in the legacy gym-style env.
    # The Driver env is a lot faster for rollouts.
    returns = []
    start = perf_counter()
    for plan in plans:
        env.reset()
        plan_return = 0.0
        for action in plan:
            _, reward, _, _ = env.step(action)
            plan_return += reward
        returns.append(plan_return)
    stop = perf_counter()
    print(f"Legacy env took {(stop - start) / len(plans)} seconds on average")
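
# Standard entry point so the timing comparison runs when this file is
# executed directly, but not on import.
if __name__ == "__main__":
    main()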