Example #1
def get_q_error(theta, envs, parallel=None):
    """Return the summed squared error between predicted and empirical Q-values.

    Runs one episode per environment with a LiederQ approximation parameterized
    by `theta` (acting greedily via MaxQPolicy) and compares its predictions to
    the empirical cumulative returns along the trajectory.
    """
    agent = Agent()
    Q = LiederQ(theta)
    agent.register(Q)
    agent.register(MaxQPolicy(Q))
    err = 0
    for e in envs:
        agent.register(e)
        agent.run_episode()
        tr = agent.ep_trace
        q = cum_returns(tr['rewards'])
        q_hat = np.array([Q.predictOne(s, a) for s, a in zip(tr['states'], tr['actions'])])
        err += np.sum((q - q_hat) ** 2)
    return err
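
A minimal usage sketch (not part of the original example): fitting theta by minimizing this error with SciPy. It assumes theta is a flat 5-dimensional weight vector for LiederQ (matching the truncated feature vector in example #2) and that make_envs and COST from example #2 are in scope; the env_type string is a placeholder.

import numpy as np
from scipy.optimize import minimize

train_envs = make_envs(COST, 100, 0, 'constant')  # 'constant' is a placeholder env_type

result = minimize(
    lambda theta: get_q_error(theta, train_envs),
    x0=np.zeros(5),        # assumes a 5-dimensional weight vector
    method='Nelder-Mead',  # derivative-free; the objective is noisy and non-smooth
)
print('best theta:', result.x, 'squared error:', result.fun)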
Example #2
def run_rollouts(env_type,
                 pol_seed,
                 env_seed=None,
                 n_env=1000,
                 n_per_env=30,
                 overwrite=False,
                 i=0):
    """Roll out a saved policy and cache (state, action, return, feature) data.

    Results are written to data/rollouts/. `i` is only the tqdm bar position,
    so several workers can run side by side without clobbering each other's bars.
    """
    if env_seed is None:
        env_seed = pol_seed + 999
    file = f'data/rollouts/{env_type}_{pol_seed}_{env_seed}.pkl'
    if os.path.isfile(file) and not overwrite:
        print(f'{file} already exists. Skipping.')
        return

    try:
        polfile = f'data/policies/{env_type}_{pol_seed}.pkl'
        pol = load(polfile)
        agent = Agent()
        agent.register(pol)
    except FileNotFoundError:
        print(f'{polfile} not found. Skipping.')
        return

    envs = make_envs(COST, n_env, env_seed, env_type)

    # Ensures that all features are computed.
    phi_pol = MouselabPolicy(dict(voi_myopic=1, vpi_action=1, vpi_full=1))
    agent.register(phi_pol)

    def phi(s, a):
        return phi_pol.phi(s, a)[:5]

    data = defaultdict(list)
    for env_i, env in enumerate(
            tqdm(envs, desc=f'{env_type} {pol_seed}', position=i)):
        agent.register(env)
        for _ in range(n_per_env):
            trace = agent.run_episode()
            qs = cum_returns(trace['rewards'])
            for s, a, q in zip(trace['states'], trace['actions'], qs):
                data['env_i'].append(env_i)
                data['s'].append(encode_state(s))
                data['a'].append(a)
                data['q'].append(q)
                data['phi'].append(phi(s, a))

    for k, v in data.items():
        if k != 's':
            data[k] = np.array(v)
    dump(dict(data), file)
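
A possible driver (an assumption, not from the source): because `i` only sets the tqdm bar position, rollouts for several policy seeds can run in separate processes, each with its own progress bar. The env_type string and the seed range are placeholders.

from concurrent.futures import ProcessPoolExecutor

def run_all(env_type='constant', seeds=range(8)):
    with ProcessPoolExecutor(max_workers=4) as ex:
        futures = [
            ex.submit(run_rollouts, env_type, seed, i=i)
            for i, seed in enumerate(seeds)
        ]
        for f in futures:
            f.result()  # re-raise any exception from a worker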
Example #3
def run_rollouts(cost, sa, N=300):
    """Estimate Q(s, a) for each (state, action) pair in `sa` using N rollouts.

    Terminal actions use the exact expected terminal reward; other actions are
    evaluated by taking the action once and letting the registered policy
    finish the episode from the resulting state.
    """
    agent = Agent()
    agent.register(read_bo_policy(cost, note='human_states'))
    states = []
    actions = []
    qs = []
    env = make_env(cost)
    agent.register(env)
    for state, action in sa:
        if action == env.term_action:
            r = env.expected_term_reward(state)
            states.extend([state] * N)
            actions.extend([action] * N)
            qs.extend([r] * N)
        else:
            for _ in range(N):
                env._state = state
                s1, r, *_ = env.step(action)
                env.init = s1
                trace = agent.run_episode()
                states.extend([state] + trace['states'][:-1])
                actions.extend([action] + trace['actions'])
                qs.extend(cum_returns([r] + trace['rewards']))
    return list(map(encode_state, states)), actions, qs
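
A hypothetical usage sketch: building Monte-Carlo Q targets for a fixed set of (state, action) pairs and averaging them per pair. human_state_actions is a placeholder list of (state, action) tuples, and the code assumes encode_state returns a hashable key.

import pandas as pd

states, actions, qs = run_rollouts(COST, human_state_actions, N=100)
df = pd.DataFrame({'s': states, 'a': actions, 'q': qs})
q_hat = df.groupby(['s', 'a']).q.mean()  # average return per (state, action) pair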
Example #4
class MetaBestFirstSearchEnv(gym.Env):
    """A meta-MDP for best first search with a deterministic transition model."""
    Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
    State = namedtuple('State', ('frontier', 'reward_to_state', 'best_done'))
    TERM = 'TERM'

    def __init__(self, env, eval_node, expansion_cost=0.01):
        super().__init__()
        self.env = env
        self.expansion_cost = -abs(expansion_cost)

        # This agent interacts with the external (object-level) environment.
        self.surface_agent = Agent()
        self.surface_agent.register(self.env)
        self.eval_node = eval_node

    def _reset(self):
        self.env.reset()
        # Warning: the model breaks if the environment is reset again.
        self.model = Model(self.env)
        start = self.Node(self.env._state, [], 0, False)
        # The noisy node evaluation is really part of the meta policy.
        frontier = PriorityQueue(key=self.eval_node(noisy=True))
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        best_done = None
        # Warning: state is mutable (and we mutate it!)
        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state

    def _step(self, action):
        """Expand a node in the frontier."""
        if action is self.TERM:
            # The return of one episode in the external env is
            # one reward in the MetaSearchEnv.
            trace = self._execute_plan()
            external_reward = trace['return']
            return None, external_reward, True, {'trace': trace}
        else:
            return self._expand_node(action), self.expansion_cost, False, {}

    def _execute_plan(self):
        frontier, reward_to_state, best_done = self._state

        if not best_done:
            raise RuntimeError('Cannot make plan.')

        policy = FixedPlanPolicy(best_done.path)
        self.surface_agent.register(policy)
        trace = self.surface_agent.run_episode(reset=False)
        return trace

    def _expand_node(self, node):
        frontier, reward_to_state, best_done = self._state
        s0, p0, r0, _ = node

        for a, s1, r, done in self.model.options(s0):
            node1 = self.Node(s1, p0 + [a], r0 + r, done)
            if node1.reward <= reward_to_state[s1] - 0.002:
                continue  # at least 0.002 worse than the best known path to s1
            reward_to_state[s1] = node1.reward
            if done:
                # best_done may still be None before the first completed plan.
                best_done = node1 if best_done is None else max(
                    (best_done, node1), key=self.eval_node(noisy=False))
            else:
                frontier.push(node1)

        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state
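
A sketch of how the meta-MDP might be rolled out with a greedy meta policy (assumptions: eval_node and env come from the surrounding project, and PriorityQueue.pop returns the highest-priority node). It terminates as soon as the first complete plan is found, purely for illustration, and calls the underscored methods directly to avoid depending on the gym wrapper version.

meta_env = MetaBestFirstSearchEnv(env, eval_node, expansion_cost=0.01)

state = meta_env._reset()
meta_return = 0.0
done = False
while not done:
    frontier, reward_to_state, best_done = state
    if best_done is not None:
        action = meta_env.TERM   # a complete plan exists: stop searching
    else:
        action = frontier.pop()  # otherwise expand the most promising frontier node
    state, reward, done, info = meta_env._step(action)
    meta_return += reward
print('meta return:', meta_return)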