    def _estimate_value(self):
        # Roll out num_episodes trajectories from every state under the
        # evaluated policy.
        tgt_generator = PolicyLogGenerator(self._env, self._policy)
        log = {}
        for state in self._env.states:
            mdps = []
            for _ in range(self._num_episodes):
                mdps.append(tgt_generator.generate_log(state))
            log[state] = mdps

        # Estimate V(s) as the average discounted return over the sampled MDPs.
        for state, mdps in log.items():
            avg = RunningAverage()
            for mdp in mdps:
                discount = 1.0
                r = 0.0
                for t in mdp:
                    r += discount * t.reward
                    discount *= self._gamma
                avg.add(r)
            self._state_values[state] = avg.average
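RunningAverage is referenced above but not defined in this excerpt. Below is a minimal sketch of the interface the snippet relies on (an add() method and an average property), given as an assumption rather than ReAgent's actual implementation:

class RunningAverage:
    """Incremental mean tracker (assumed interface for the snippet above)."""

    def __init__(self):
        self._count = 0
        self._average = 0.0

    def add(self, value: float) -> "RunningAverage":
        # Incremental mean update: avg += (x - avg) / n
        self._count += 1
        self._average += (value - self._average) / self._count
        return self

    @property
    def average(self) -> float:
        return self._average

    @property
    def count(self) -> int:
        return self._count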
Example #2
    def test_gridworld_sequential_adapter(self):
        """
        Creates a gridworld environment, a logging policy, and a target policy.
        Evaluates the target policy with the sequential doubly robust OPE estimator
        directly, then transforms the log into an evaluation data page (EDP), which
        is passed to the OPE adapter.

        This test verifies the adaptation of EDPs into RLEstimatorInputs as used by
        ReAgent, since ReAgent provides EDPs to Evaluators. Going from EDP ->
        RLEstimatorInput is more involved than RLEstimatorInput -> EDP because the
        EDP does not store the state at each timestep in each MDP, only the
        corresponding logged outputs & model outputs. Thus, the adapter must do some
        tricks to represent these timesteps as states so the OPE module can extract
        the correct outputs.

        Note that there is some randomness in the model outputs since the model is
        purposefully noisy. However, the same target policy is evaluated on the same
        logged walks through the gridworld, so the two results should be close in
        value (within 1).

        """
        random.seed(0)
        np.random.seed(0)
        torch.random.manual_seed(0)

        device = torch.device("cuda") if torch.cuda.is_available() else None

        gridworld = GridWorld.from_grid(
            [
                ["s", "0", "0", "0", "0"],
                ["0", "0", "0", "W", "0"],
                ["0", "0", "0", "0", "0"],
                ["0", "W", "0", "0", "0"],
                ["0", "0", "0", "0", "g"],
            ],
            max_horizon=TestOPEModuleAlgs.MAX_HORIZON,
        )

        action_space = ActionSpace(4)
        opt_policy = TabularPolicy(action_space)
        trainer = DPTrainer(gridworld, opt_policy)
        value_func = trainer.train(gamma=TestOPEModuleAlgs.GAMMA)

        behavior_policy = RandomRLPolicy(action_space)
        target_policy = EpsilonGreedyRLPolicy(opt_policy,
                                              TestOPEModuleAlgs.NOISE_EPSILON)
        model = NoiseGridWorldModel(
            gridworld,
            action_space,
            epsilon=TestOPEModuleAlgs.NOISE_EPSILON,
            max_horizon=TestOPEModuleAlgs.MAX_HORIZON,
        )
        value_func = DPValueFunction(target_policy, model,
                                     TestOPEModuleAlgs.GAMMA)
        ground_truth = DPValueFunction(target_policy, gridworld,
                                       TestOPEModuleAlgs.GAMMA)

        log = []
        log_generator = PolicyLogGenerator(gridworld, behavior_policy)
        num_episodes = TestOPEModuleAlgs.EPISODES
        for state in gridworld.states:
            for _ in range(num_episodes):
                log.append(log_generator.generate_log(state))

        estimator_input = RLEstimatorInput(
            gamma=TestOPEModuleAlgs.GAMMA,
            log=log,
            target_policy=target_policy,
            value_function=value_func,
            ground_truth=ground_truth,
        )

        edp = rlestimator_input_to_edp(estimator_input,
                                       len(model.action_space))

        dr_estimator = SeqDREstimator(weight_clamper=None,
                                      weighted=False,
                                      device=device)

        module_results = SequentialOPEstimatorAdapter.estimator_results_to_cpe_estimate(
            dr_estimator.evaluate(estimator_input))
        adapter_results = SequentialOPEstimatorAdapter(
            dr_estimator, TestOPEModuleAlgs.GAMMA, device=device).estimate(edp)

        self.assertAlmostEqual(
            adapter_results.raw,
            module_results.raw,
            delta=TestOPEModuleAlgs.CPE_PASS_BAR,
            msg=f"OPE adapter results differed too much from underlying module "
            f"(Diff: {abs(adapter_results.raw - module_results.raw)} > "
            f"{TestOPEModuleAlgs.CPE_PASS_BAR})",
        )
        self.assertLess(
            adapter_results.raw,
            TestOPEModuleAlgs.CPE_MAX_VALUE,
            msg=f"OPE adapter results are too large "
            f"({adapter_results.raw} > {TestOPEModuleAlgs.CPE_MAX_VALUE})",
        )
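The class-level constants referenced in this test are defined outside the excerpt. A hypothetical sketch is given below for orientation only: GAMMA, NOISE_EPSILON, MAX_HORIZON, and EPISODES are guessed from the similar values in Example #3, CPE_PASS_BAR from the "within 1" note in the docstring, and CPE_MAX_VALUE is an arbitrary placeholder.

import unittest


class TestOPEModuleAlgs(unittest.TestCase):
    # Illustrative values; the real constants are not shown in this excerpt.
    GAMMA = 0.9            # assumed discount factor
    NOISE_EPSILON = 0.3    # Example #3 uses epsilon=0.3
    MAX_HORIZON = 1000     # Example #3 uses max_horizon=1000
    EPISODES = 200         # Example #3 uses num_episodes=200
    CPE_PASS_BAR = 1.0     # docstring: results "should be close in value (within 1)"
    CPE_MAX_VALUE = 100.0  # arbitrary placeholder upper bound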
Example #3
    model = NoiseGridWorldModel(gridworld,
                                action_space,
                                epsilon=0.3,
                                max_horizon=1000)
    value_func = DPValueFunction(target_policy, model, GAMMA)
    ground_truth = DPValueFunction(target_policy, gridworld, GAMMA)

    logging.info(f"Target Policy ground truth values:\n"
                 f"{gridworld.dump_value_func(ground_truth)}")

    log = {}
    log_generator = PolicyLogGenerator(gridworld, behavior_policy)
    num_episodes = 200
    for state in gridworld.states:
        mdps = []
        for _ in range(num_episodes):
            mdps.append(log_generator.generate_log(state))
        log[state] = mdps
        logging.info(f"Generated {len(mdps)} logs for {state}")

    estimator_input = RLEstimatorInput(
        gamma=GAMMA,
        log=log,
        target_policy=target_policy,
        value_function=value_func,
        ground_truth=ground_truth,
    )

    DMEstimator(device=device).evaluate(estimator_input)

    IPSEstimator(weight_clamper=None, weighted=False,
                 device=device).evaluate(estimator_input)
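For a quick sanity check against the ground truth values logged above, the discounted returns in log can also be averaged directly, mirroring the Monte Carlo estimate in the first example. A sketch, assuming the same transition objects with a reward attribute:

def average_discounted_return(log, gamma):
    # Mean discounted return per start state, computed from the logged MDPs.
    values = {}
    for state, mdps in log.items():
        total = 0.0
        for mdp in mdps:
            discount, ret = 1.0, 0.0
            for t in mdp:
                ret += discount * t.reward
                discount *= gamma
            total += ret
        values[state] = total / len(mdps) if mdps else 0.0
    return values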
Example #4
class MonteCarloTrainer(object):
    def __init__(self, env: Environment, policy: TabularPolicy):
        self._env = env
        self._policy = policy
        self._log_generator = PolicyLogGenerator(env, policy)

    def train(
        self,
        iterations: int,
        gamma: float = 0.9,
        first_visit: bool = True,
        update_interval: int = 20,
    ):
        i = 0
        value_counts = {}
        while i < iterations:
            i += 1
            for state in self._env.states:
                mdp = self._log_generator.generate_log(state)
                if first_visit:
                    vcounts = {}
                    for t in mdp:
                        if t.last_state is None or t.action is None:
                            continue
                        key = (t.last_state, t.action)
                        if key in vcounts:
                            vcounts[key] += 1
                        else:
                            vcounts[key] = 1
                    g = 0
                    for t in reversed(mdp):
                        if t.last_state is None or t.action is None:
                            continue
                        g = gamma * g + t.reward
                        key = (t.last_state, t.action)
                        vc = vcounts[key]
                        # First-visit MC: update only at the earliest occurrence
                        # of (state, action) in the episode, i.e. when a single
                        # visit remains while scanning the episode in reverse.
                        if vc == 1:
                            self._update_state_value(value_counts,
                                                     t.last_state, t.action, g)
                        vc -= 1
                        if vc == 0:
                            del vcounts[key]
                        else:
                            vcounts[key] = vc
                else:
                    g = 0
                    for t in reversed(mdp):
                        if t.last_state is None or t.action is None:
                            continue
                        g = gamma * g + t.reward
                        self._update_state_value(value_counts, t.last_state,
                                                 t.action, g)
            if i % update_interval == 0 and self._update_policy(value_counts):
                break

    def _update_state_value(self, value_counts, state, action, g: float):
        key = (state, action)
        sv, sc = value_counts[key] if key in value_counts else (0.0, 0)
        sc += 1
        sv = sv + (g - sv) / sc
        value_counts[key] = (sv, sc)

    def _update_policy(self, value_counts) -> bool:
        stable = True
        for state in self._env.states:
            probs = []
            for a in self._policy.action_space:
                key = (state, a)
                if key not in value_counts:
                    probs.append(0.0)
                else:
                    v, c = value_counts[key]
                    probs.append(v * c)
            probs = torch.nn.functional.softmax(torch.tensor(probs),
                                                dim=0).tolist()
            if self._policy.update(state, probs) >= 1.0e-6:
                stable = False
        return stable
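A usage sketch for this trainer, assuming the GridWorld, ActionSpace, and TabularPolicy classes shown in Example #2 are available:

# Hypothetical usage, reusing the 5x5 grid from Example #2.
gridworld = GridWorld.from_grid(
    [
        ["s", "0", "0", "0", "0"],
        ["0", "0", "0", "W", "0"],
        ["0", "0", "0", "0", "0"],
        ["0", "W", "0", "0", "0"],
        ["0", "0", "0", "0", "g"],
    ],
    max_horizon=1000,
)
policy = TabularPolicy(ActionSpace(4))
trainer = MonteCarloTrainer(gridworld, policy)
trainer.train(iterations=1000, gamma=0.9, first_visit=True, update_interval=20)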
Example #5
class MonteCarloValueFunction(TabularValueFunction):
    def __init__(
        self,
        policy: RLPolicy,
        env: Environment,
        gamma: float = 0.99,
        first_visit: bool = True,
        count_threshold: int = 100,
        max_iteration: int = 200,
    ):
        super().__init__(policy, env, gamma)
        self._env = env
        self._first_visit = first_visit
        self._count_threshold = count_threshold
        self._max_iteration = max_iteration
        self._log_generator = PolicyLogGenerator(env, policy)
        self._state_counts = {}

    def _state_value(self, state: State):
        i = 0
        state_count = self._state_counts.get(state, 0)
        while state_count < self._count_threshold and i < self._max_iteration:
            i += 1
            mdp = self._log_generator.generate_log(state)
            if self._first_visit:
                state_counts = {}
                for t in mdp:
                    if t.last_state is None:
                        continue
                    if t.last_state in state_counts:
                        state_counts[t.last_state] += 1
                    else:
                        state_counts[t.last_state] = 1
                g = 0
                for t in reversed(mdp):
                    if t.last_state is None:
                        continue
                    g = self._gamma * g + t.reward
                    counts = state_counts[t.last_state]
                    # First-visit MC: update only at the earliest occurrence of
                    # the state in the episode (a single visit left in the
                    # reverse scan).
                    if counts == 1:
                        self._update_state_value(t.last_state, g)
                    counts -= 1
                    if counts == 0:
                        del state_counts[t.last_state]
                    else:
                        state_counts[t.last_state] = counts
            else:
                g = 0
                for t in reversed(mdp):
                    if t.last_state is None:
                        continue
                    g = self._gamma * g + t.reward
                    self._update_state_value(t.last_state, g)
            state_count = self._state_counts.get(state, 0)
        return super()._state_value(state)

    def _update_state_value(self, state: State, g: float):
        sv = super()._state_value(state)
        sc = self._state_counts[state] if state in self._state_counts else 0
        sc += 1
        sv = sv + (g - sv) / sc
        self._state_values[state] = sv
        self._state_counts[state] = sc

    def state_value(self, state: State) -> float:
        return self._state_value(state)

    def reset(self, clear_state_values: bool = False):
        if clear_state_values:
            self._state_values.clear()
            self._state_counts.clear()
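A usage sketch, assuming the gridworld and epsilon-greedy target policy constructed in Example #2:

# Hypothetical usage: estimate the target policy's value at each grid state.
mc_value_func = MonteCarloValueFunction(
    target_policy,
    gridworld,
    gamma=0.9,
    first_visit=True,
    count_threshold=100,
    max_iteration=200,
)
for state in gridworld.states:
    print(f"V({state}) ~= {mc_value_func.state_value(state):.3f}")
mc_value_func.reset(clear_state_values=True)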