Example 1
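For this example to run on its own, roughly the following imports are assumed; `TrajOptimizer`, `load_td3`, `eval`, `parse_replications`, `make_td3_paths`, and `shape_compat` are project-local helpers whose import paths are not shown in the snippet.

import logging
import pickle
from itertools import product
from pathlib import Path
from time import perf_counter
from typing import Literal, Optional, Tuple

import gym
import matplotlib.pyplot as plt
import numpy as np
from joblib import Parallel, delayed

# Project-local helpers (import paths depend on the surrounding project):
# TrajOptimizer, load_td3, eval, parse_replications, make_td3_paths, shape_compat
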
def make_opt_trajs(
    traj_opt: TrajOptimizer,
    rewards: np.ndarray,
    starts: np.ndarray,
    log_time: bool = False,
) -> Tuple[np.ndarray, np.ndarray]:
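    """Compute an optimal trajectory for each (reward, start state) pair.

    Returns the stacked trajectories with shape (n, 50, 2) together with the
    per-trajectory optimizer losses; the mean planning time is logged when
    log_time is True.
    """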
    trajs = []
    losses = []
    times = []
    for reward, start_state in zip(rewards, starts):
        start = perf_counter()
        traj, loss = traj_opt.make_opt_traj(reward, start_state, return_loss=True)
        stop = perf_counter()

        trajs.append(traj)
        losses.append(loss)

        times.append(stop - start)

    trajs_array = np.array(trajs)
    assert len(trajs_array.shape) == 3
    assert trajs_array.shape[1:] == (50, 2)

    if log_time:
        logging.info(f"Mean traj opt time={np.mean(times)}")
    return trajs_array, np.array(losses)
def align_worker(
    rewards: np.ndarray,
    states: np.ndarray,
    optim: TrajOptimizer,
    action_shape: Tuple[int, ...] = (2, ),
) -> np.ndarray:
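    """Plan one trajectory per (reward, state) pair inside a joblib worker.

    Returns an array of plans with shape (batch_size, 50, *action_shape).
    """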
    batch_size = rewards.shape[0]
    assert states.shape[0] == batch_size
    plans = np.empty((batch_size, 50, *action_shape))
    for i, (reward, state) in enumerate(zip(rewards, states)):
        traj, _ = optim.make_opt_traj(reward, state)
        plans[i] = traj.reshape(-1, *action_shape)

    return plans
def make_plans(
    rewards: np.ndarray,
    states: np.ndarray,
    optim: TrajOptimizer,
    parallel: Optional[Parallel] = None,
    action_shape: Tuple[int, ...] = (2, ),
    memorize: bool = False,
) -> np.ndarray:
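    """Plan a trajectory for every combination of reward and start state.

    If a joblib Parallel instance is provided, the (reward, state) pairs are
    split across workers via align_worker; otherwise planning runs in a plain
    double loop. Returns an array of shape
    (len(rewards), len(states), 50, *action_shape).
    """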

    assert shape_compat(
        rewards,
        (-1, 4)), f"rewards shape={rewards.shape} is wrong, expected (-1, 4)"

    if parallel is not None:
        input_batches = np.array_split(list(product(rewards, states)),
                                       parallel.n_jobs)

        logging.debug("Branching")

        return np.concatenate(
            parallel(
                delayed(align_worker)(
                    rewards=batch[:, 0],
                    states=batch[:, 1],
                    optim=optim,
                    action_shape=action_shape,
                )
                for batch in input_batches)).reshape(len(rewards), len(states),
                                                     50, *action_shape)
    else:
        plans = np.empty((len(rewards), len(states), 50, *action_shape))
        for i, reward in enumerate(rewards):
            assert reward.shape == (4, )
            for j, state in enumerate(states):
                traj, _ = optim.make_opt_traj(reward, state, memorize=memorize)
                plans[i, j] = traj.reshape(-1, *action_shape)
        return plans
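
A minimal usage sketch for make_plans (not part of the original source): the array shapes follow the asserts above, and the TrajOptimizer iteration-count argument mirrors how compare constructs it below.

# Hypothetical data purely for illustration: 8 reward vectors of length 4 and
# 3 start states of shape (2, 4), matching the shape checks in this module.
example_rewards = np.random.default_rng(0).random((8, 4)).astype(np.float32)
example_states = np.random.default_rng(1).random((3, 2, 4)).astype(np.float32)
example_optim = TrajOptimizer(10)

example_plans = make_plans(example_rewards, example_states, example_optim)
assert example_plans.shape == (8, 3, 50, 2)

# With a joblib pool, the same call splits the (reward, state) product across
# workers instead of using the double loop:
#     make_plans(example_rewards, example_states, example_optim,
#                parallel=Parallel(n_jobs=4))
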
def compare(
    reward_path: Path,
    td3_dir: Path,
    outdir: Path,
    planner_iters: int = 10,
    random_start: bool = False,
    n_starts: int = 1,
    replications: Optional[str] = None,
    verbosity: Literal["INFO", "DEBUG"] = "INFO",
):
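    """Compare trajectories from the planner against a trained TD3 policy.

    For each of n_starts start states, the average return of the planner's
    trajectory is compared with the empirical return of the TD3 policy.
    Histograms of the returns and of the regret are written to
    outdir/comparison_plots, and the cases where the planner underperforms
    are pickled to outdir/planner_mistakes.pkl. If `replications` is given,
    the comparison is repeated once per replication index.
    """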
    logging.basicConfig(level=verbosity, format="%(levelname)s:%(asctime)s:%(message)s")
    if replications is not None:
        replication_indices = parse_replications(replications)
        td3_paths = make_td3_paths(Path(td3_dir), replication_indices)
        for replication, td3_path in zip(replication_indices, td3_paths):
            compare(
                reward_path=Path(reward_path) / str(replication) / "true_reward.npy",
                outdir=Path(outdir) / str(replication),
                td3_dir=td3_path,
                planner_iters=planner_iters,
                random_start=random_start,
                n_starts=n_starts,
                verbosity=verbosity,
            )
        return

    reward_weights: np.ndarray = np.load(reward_path).astype(np.float32)
    env = gym.make("LegacyDriver-v1", reward=reward_weights, random_start=random_start)
    td3 = load_td3(env, td3_dir)

    traj_optimizer = TrajOptimizer(planner_iters)

    class BadPlannerCollection:
        def __init__(self):
            self.states = None
            self.rewards = None
            self.trajs = None

        def append(
            self, state: np.ndarray, reward: np.ndarray, traj: np.ndarray
        ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
            if self.states is None:
                self.states = np.array([state])
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.array([reward])
                self.trajs = np.array([traj])
            else:
                self.states = np.append(self.states, [state], axis=0)
                logging.debug(f"state shape={state.shape}, states shapes={self.states.shape}")
                self.rewards = np.append(self.rewards, [reward], axis=0)
                self.trajs = np.append(self.trajs, [traj], axis=0)

            self.check_shapes()

            return self.get()

        def check_shapes(self):
            assert len(self.states.shape) == 3
            assert len(self.rewards.shape) == 2
            assert len(self.trajs.shape) == 3

            assert self.states.shape[1:] == (2, 4)
            assert self.rewards.shape[1] == 4
            assert self.trajs.shape[1:] == (50, 2)

        def get(self) -> Tuple[Optional[np.ndarray], Optional[np.ndarray], Optional[np.ndarray]]:
            return self.states, self.rewards, self.trajs

    planner_bad = BadPlannerCollection()

    returns = np.empty((n_starts, 2))
    for i in range(n_starts):
        logging.info(f"{i+1}/{n_starts}")
        start_state: np.ndarray = env.reset()

        logging.info("Optimizing traj")
        opt_traj = traj_optimizer.make_opt_traj(reward_weights, start_state)

        logging.info("Executing traj")
        opt_return = 0.0
        for action in opt_traj:
            state, reward, done, info = env.step(action)
            opt_return += reward

        opt_return = opt_return / len(opt_traj)

        logging.info("Evaluating policy")
        empirical_return, traj = eval(
            reward_weights=reward_weights,
            td3=td3,
            start_state=start_state,
            time_in_state=False,
            return_actions=True,
        )

        returns[i] = empirical_return, opt_return

        if opt_return < empirical_return:
            planner_bad.append(start_state, reward_weights, traj)

    outdir.mkdir(parents=True, exist_ok=True)
    plot_dir = outdir / "comparison_plots"
    plot_dir.mkdir(parents=True, exist_ok=True)

    plt.hist(returns[:, 0], label="Empirical", alpha=0.5)
    plt.hist(returns[:, 1], label="Optimal", alpha=0.5)
    plt.title("Histogram of Optimal vs Empirical returns")
    plt.legend()
    plt.savefig(plot_dir / "returns.png")
    plt.close()

    regret = returns[:, 1] - returns[:, 0]
    plt.hist(regret)
    plt.title("Histogram of regret")
    plt.savefig(plot_dir / "regret.png")
    plt.close()
    logging.info(f"Average regret = {np.mean(regret)}, min={np.min(regret)}, max={np.max(regret)}")

    with (outdir / "planner_mistakes.pkl").open("wb") as f:
        pickle.dump(planner_bad.get(), f)
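
A hedged example of calling compare directly; the paths below are hypothetical and depend on where the true reward vector and the TD3 checkpoints are stored.

# Hypothetical paths; "LegacyDriver-v1" must be registered by the project
# before compare() constructs the environment.
compare(
    reward_path=Path("results/true_reward.npy"),
    td3_dir=Path("results/td3"),
    outdir=Path("results/comparison"),
    planner_iters=100,
    random_start=True,
    n_starts=10,
)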