from time import perf_counter
from typing import Optional, Tuple

import numpy as np

# Driver, LegacyEnv, make_actions, shape_compat, and assert_normals come from
# the surrounding project; their imports are omitted here.

def get_simulated_feedback(
    simulation: Driver,
    input_A: np.ndarray,
    input_B: np.ndarray,
    query_type: str,
    true_reward: np.ndarray,
    delta: Optional[float] = None,
) -> Tuple[np.ndarray, np.ndarray, int]:
    """ Gets preference between trajectories from an agent simulated by true_reward """
    simulation.feed(input_A)
    phi_A = np.array(simulation.get_features())
    simulation.feed(input_B)
    phi_B = np.array(simulation.get_features())
    if query_type == "weak":
        # TODO(joschnei): Implement weak errors using delta. I think there's a model for this but I can't remember off hand.
        raise NotImplementedError(
            "Simulated weak preferences not implemented.")
        if delta is None:
            raise ValueError("Must provide delta when using weak queries.")
    elif query_type == "strict":
        s = 1 if true_reward @ (phi_A - phi_B) > 0 else -1
    else:
        raise ValueError(
            f'query type {query_type} must be either "strict" or "weak"')
    return phi_A, phi_B, s
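
# Illustrative usage sketch (not part of the original snippet): query a simulated
# agent for a strict preference between two plans. The 4-feature reward and the
# per-plan action arrays are assumptions about the Driver simulation.
def example_simulated_feedback(sim: Driver, plan_a: np.ndarray, plan_b: np.ndarray) -> None:
    true_reward = np.ones(4)  # hypothetical ground-truth reward weights
    phi_a, phi_b, pref = get_simulated_feedback(
        sim, plan_a, plan_b, query_type="strict", true_reward=true_reward
    )
    # pref is 1 when plan_a is preferred under true_reward, otherwise -1.
    print(phi_a, phi_b, pref)
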
def make_normals(inputs: np.ndarray, sim: Driver,
                 use_equiv: bool) -> Tuple[np.ndarray, np.ndarray]:
    """Converts pairs of car inputs to trajectory preference normal vectors.

    Args:
        inputs (np.ndarray): (n, 2, T, 2) array of pairs of 2-dimension actions for T timesteps
        sim (Driver): Driving simulation to get features from
        use_equiv (bool): Allow equivalent preferences?

    Returns:
        Tuple[np.ndarray, np.ndarray]: input features and normal vectors
    """
    if len(inputs.shape) == 3:
        shape_compat(inputs, (-1, 2, -1))
    elif len(inputs.shape) == 4:
        shape_compat(inputs, (-1, 2, -1, 2))

    normals = np.empty(shape=(inputs.shape[0], sim.num_of_features))
    input_features = np.empty(shape=(inputs.shape[0], 2, sim.num_of_features))
    for i, (input_a, input_b) in enumerate(inputs):
        # Roll out each plan in the simulator and record its feature counts.
        sim.feed(input_a)
        phi_a = np.array(sim.get_features())

        sim.feed(input_b)
        phi_b = np.array(sim.get_features())

        input_features[i] = np.stack((phi_a, phi_b))

        # The preference normal points toward A: w @ (phi_a - phi_b) > 0 means
        # a reward with weights w prefers trajectory A.
        normals[i] = phi_a - phi_b
    assert_normals(normals, use_equiv)
    return input_features, normals
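
# Illustrative usage sketch (assumes make_actions returns an (m, T, 2) ndarray of
# action plans and that Driver exposes num_of_features): pair up sampled plans and
# convert each pair into a preference normal.
def example_make_normals(sim: Driver, n_pairs: int = 10) -> np.ndarray:
    plans = make_actions(2 * n_pairs)
    pairs = plans.reshape(n_pairs, 2, *plans.shape[1:])  # (n, 2, T, 2) pairs of plans
    _, normals = make_normals(pairs, sim, use_equiv=False)
    return normals  # one (phi_a - phi_b) row per pair
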
def main() -> None:
    reward_weights = np.ones(4)
    sim = Driver()
    env = LegacyEnv(reward_weights)

    plans = make_actions(1000)

    returns = []
    start = perf_counter()
    for plan in plans:
        sim.feed(plan)
        features = sim.get_features()
        returns.append(reward_weights @ features)
    stop = perf_counter()
    print(f"Legacy env took {(stop - start) / len(plans)} seconds on average")
    # Driver env is a lot faster for rollouts

    returns = []
    start = perf_counter()
    for plan in plans:
        env.reset()
        plan_return = 0.0
        for action in plan:
            _, reward, _, _ = env.step(action)
            plan_return += reward
        returns.append(plan_return)
    stop = perf_counter()
    print(f"tf env took {(stop - start) / len(plans)} seconds on average")