import os
from collections import defaultdict, namedtuple

import gym
import numpy as np
from tqdm import tqdm

# Project-local helpers (Agent, read_bo_policy, make_env, make_envs,
# encode_state, cum_returns, LiederQ, MaxQPolicy, MouselabPolicy,
# FixedPlanPolicy, PriorityQueue, Model, load, dump, COST) are assumed to be
# imported from the surrounding codebase.


def run_rollouts(cost, sa, N=300):
    """Roll out N episodes from each (state, action) pair in `sa`.

    Returns encoded states, actions, and the corresponding Monte Carlo
    Q-value samples (returns-to-go).
    """
    agent = Agent()
    agent.register(read_bo_policy(cost, note='human_states'))
    states = []
    actions = []
    qs = []
    env = make_env(cost)
    agent.register(env)
    for state, action in sa:
        if action == env.term_action:
            # Terminating: the Q value is the expected termination reward.
            r = env.expected_term_reward(state)
            states.extend([state] * N)
            actions.extend([action] * N)
            qs.extend([r] * N)
        else:
            for _ in range(N):
                # Take the given action, then let the policy finish the episode.
                env._state = state
                s1, r, *_ = env.step(action)
                env.init = s1
                trace = agent.run_episode()
                states.extend([state] + trace['states'][:-1])
                actions.extend([action] + trace['actions'])
                qs.extend(cum_returns([r] + trace['rewards']))
    return list(map(encode_state, states)), actions, qs
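
# For reference, a minimal sketch of what `cum_returns` (used above) is
# assumed to compute: the undiscounted return-to-go at every step. The name
# `_cum_returns_sketch` is hypothetical; the project's real helper may differ
# (e.g., it could apply discounting).
def _cum_returns_sketch(rewards):
    """Cumulative sum of rewards from each step to the end of the episode."""
    return np.cumsum(np.asarray(rewards)[::-1])[::-1]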
def get_q_error(theta, envs, parallel=None):
    """Sum of squared errors between sampled returns and LiederQ predictions.

    Note: `parallel` is accepted for API compatibility but is not used here.
    """
    agent = Agent()
    Q = LiederQ(theta)
    agent.register(Q)
    agent.register(MaxQPolicy(Q))
    err = 0
    for e in envs:
        agent.register(e)
        agent.run_episode()
        tr = agent.ep_trace
        q = cum_returns(tr['rewards'])
        q_hat = np.array([Q.predictOne(s, a)
                          for s, a in zip(tr['states'], tr['actions'])])
        err += np.sum((q - q_hat) ** 2)
    return err
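
# Hypothetical usage: fit `theta` by minimizing the squared Q error over a
# batch of training environments. The function name, the env type, the weight
# dimensionality (5, matching the phi() features below), and the optimizer
# choice are all assumptions, not part of the original code; note also that
# get_q_error is stochastic, so a noise-tolerant optimizer may be preferable.
from scipy.optimize import minimize

def fit_theta(env_type='constant', n_env=100, seed=0):
    envs = make_envs(COST, n_env, seed, env_type)
    x0 = np.zeros(5)  # assumed weight dimensionality
    res = minimize(lambda th: get_q_error(th, envs), x0, method='Nelder-Mead')
    return res.x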
def run_rollouts(env_type, pol_seed, env_seed=None, n_env=1000, n_per_env=30,
                 overwrite=False, i=0):
    """Roll out a saved policy on fresh environments and dump the traces."""
    if env_seed is None:
        env_seed = pol_seed + 999
    file = f'data/rollouts/{env_type}_{pol_seed}_{env_seed}.pkl'
    if os.path.isfile(file) and not overwrite:
        print(f'{file} already exists. Skipping.')
        return
    polfile = f'data/policies/{env_type}_{pol_seed}.pkl'
    try:
        pol = load(polfile)
    except FileNotFoundError:
        print(f'{polfile} not found. Skipping.')
        return
    agent = Agent()
    agent.register(pol)
    envs = make_envs(COST, n_env, env_seed, env_type)

    # Ensures that all features are computed.
    phi_pol = MouselabPolicy(dict(voi_myopic=1, vpi_action=1, vpi_full=1))
    agent.register(phi_pol)

    def phi(s, a):
        return phi_pol.phi(s, a)[:5]

    data = defaultdict(list)
    for env_i, env in enumerate(
            tqdm(envs, desc=f'{env_type} {pol_seed}', position=i)):
        agent.register(env)
        for _ in range(n_per_env):
            trace = agent.run_episode()
            qs = cum_returns(trace['rewards'])
            for s, a, q in zip(trace['states'], trace['actions'], qs):
                data['env_i'].append(env_i)
                data['s'].append(encode_state(s))
                data['a'].append(a)
                data['q'].append(q)
                data['phi'].append(phi(s, a))
    for k, v in data.items():
        if k != 's':
            data[k] = np.array(v)
    dump(dict(data), file)
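
# Sketch of consuming a dumped rollout file downstream: a linear least-squares
# fit of the sampled Q values on the five policy features. `fit_linear_q` is a
# hypothetical name, and `load` is assumed to be the pickle helper paired with
# `dump` above.
def fit_linear_q(file):
    data = load(file)
    X = np.stack(data['phi'])                           # (n, 5) feature matrix
    w, *_ = np.linalg.lstsq(X, data['q'], rcond=None)   # q ≈ X @ w
    return w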
class MetaBestFirstSearchEnv(gym.Env):
    """A meta-MDP for best-first search with a deterministic transition model."""
    Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
    State = namedtuple('State', ('frontier', 'reward_to_state', 'best_done'))
    TERM = 'TERM'

    def __init__(self, env, eval_node, expansion_cost=0.01):
        super().__init__()
        self.env = env
        self.expansion_cost = -abs(expansion_cost)
        # This guy interacts with the external environment, what a chump!
        self.surface_agent = Agent()
        self.surface_agent.register(self.env)
        self.eval_node = eval_node

    def _reset(self):
        self.env.reset()
        # Warning: this breaks if env resets again.
        self.model = Model(self.env)
        start = self.Node(self.env._state, [], 0, False)
        # The noisy evaluation is really part of the meta policy.
        frontier = PriorityQueue(key=self.eval_node(noisy=True))
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        best_done = None
        # Warning: state is mutable (and we mutate it!)
        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state

    def _step(self, action):
        """Expand a node in the frontier, or terminate and execute the plan."""
        if action is self.TERM:
            # The return of one episode in the external env is
            # one reward in the MetaSearchEnv.
            trace = self._execute_plan()
            external_reward = trace['return']
            return None, external_reward, True, {'trace': trace}
        else:
            return self._expand_node(action), self.expansion_cost, False, {}

    def _execute_plan(self):
        frontier, reward_to_state, best_done = self._state
        if not best_done:
            raise RuntimeError('Cannot make plan.')
        policy = FixedPlanPolicy(best_done.path)
        self.surface_agent.register(policy)
        trace = self.surface_agent.run_episode(reset=False)
        return trace
        # elif frontier:
        #     plan = min(best_done, frontier.pop(), key=eval_node)
        #     plan = frontier.pop()

    def _expand_node(self, node):
        frontier, reward_to_state, best_done = self._state
        s0, p0, r0, _ = node
        for a, s1, r, done in self.model.options(s0):
            node1 = self.Node(s1, p0 + [a], r0 + r, done)
            if node1.reward <= reward_to_state[s1] - 0.002:
                continue  # cannot be better than an existing node
            reward_to_state[s1] = node1.reward
            if done:
                # Guard against best_done being None on the first completion.
                best_done = node1 if best_done is None else max(
                    (best_done, node1), key=self.eval_node(noisy=False))
            else:
                frontier.push(node1)
        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state
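
# A hedged sketch of driving the meta-MDP: pop the frontier's best node and
# expand it until a completed plan exists and the frontier is exhausted, then
# terminate. `run_meta_episode` is a hypothetical name; it assumes
# PriorityQueue supports len() and pop(), and the termination rule is
# illustrative (a real meta policy would weigh the best finished plan against
# the value of further expansion).
def run_meta_episode(meta_env):
    state = meta_env._reset()
    meta_return = 0.0
    while True:
        frontier, _, best_done = state
        if best_done is not None and not len(frontier):
            # Terminate: execute the best completed plan in the external env.
            _, reward, _, info = meta_env._step(meta_env.TERM)
            return meta_return + reward, info
        node = frontier.pop()  # best node under the noisy evaluation
        state, reward, _, _ = meta_env._step(node)
        meta_return += reward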