class ValSearchPolicy(Policy):
    """Searches for the maximum reward path using a model."""

    def __init__(self, V, replan=False, epsilon=0, noise=1, anneal=1, **kwargs):
        super().__init__(**kwargs)
        self.V = V
        self.replan = replan
        self.epsilon = epsilon
        self.noise = noise
        self.anneal = anneal
        self.history = None
        self.model = None
        self.plan = None

    def start_episode(self, state):
        self.history = Counter()
        self.model = Model(self.env)
        self.plan = iter(())  # start with no plan

    def finish_episode(self, trace):
        self.ep_trace['berries'] = self.env._observe()[-1]

    def act(self, state):
        self.history[state] += 1
        try:
            if self.replan:
                raise StopIteration()
            return next(self.plan)
        except StopIteration:
            self.plan = iter(self.make_plan(state))
            return next(self.plan)

    def make_plan(self, state, expansions=2000):
        Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
        env = self.env
        V = memoize(self.V.predict)
        self.node_history = []

        def eval_node(node, noisy=False):
            """Priority of a node: the negated score of its (partial) plan."""
            if not node.path:
                return np.inf  # the empty plan has infinite cost
            obs = env._observe(node.state)
            noise = (np.random.rand() * self.noise * self.anneal ** self.i_episode
                     if noisy else 0)
            value = 0 if node.done else V(obs)[0]
            boredom = -0.1 * self.history[obs]  # penalize revisiting states
            score = node.reward + value + noise + boredom
            return -score

        start = Node(env._state, [], 0, False)
        frontier = PriorityQueue(key=eval_node)
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        reward_to_state[start.state] = 0
        best_finished = start

        def expand(node):
            nonlocal best_finished
            # Track the best node seen so far, even if it is not terminal.
            best_finished = min((best_finished, node), key=eval_node)
            s0, p0, r0, _ = node
            for a, s1, r, done in self.model.options(s0):
                node1 = Node(s1, p0 + [a], r0 + r, done)
                if node1.reward <= reward_to_state[s1]:
                    continue  # cannot be better than an existing node
                self.node_history.append({
                    'path': node1.path,
                    'r': node1.reward,
                    'b': self.env._observe(node1.state)[-1],
                    'v': -eval_node(node1)
                })
                reward_to_state[s1] = node1.reward
                if done:
                    best_finished = min((best_finished, node1), key=eval_node)
                else:
                    frontier.push(node1)

        for i in range(expansions):
            if frontier:
                expand(frontier.pop())
            else:
                break

        if frontier:
            plan = min(best_finished, frontier.pop(), key=eval_node)
        else:
            plan = best_finished

        self.log(
            i,
            len(plan.path),
            -round(eval_node(plan, noisy=False), 2),
            plan.done,
        )
        return plan.path
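# A minimal usage sketch for ValSearchPolicy, not part of the original code.
# It assumes the Agent class used below by MetaBestFirstSearchEnv, and an
# environment exposing the interface the policy relies on (_observe, _state).
# `make_env` and `value_model` are hypothetical stand-ins: any zero-argument
# env constructor and any object with a .predict method would do.
def _demo_val_search_policy(make_env, value_model):
    env = make_env()
    policy = ValSearchPolicy(value_model, replan=False, noise=1, anneal=0.99)
    agent = Agent()
    agent.register(env)
    agent.register(policy)
    trace = agent.run_episode()  # plans by best-first search, then executes
    return trace['return']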
class Astar(Policy):
    """A* search finds the shortest path to a goal."""

    def __init__(self, heuristic):
        super().__init__()
        self.heuristic = heuristic
        self.plan = iter(())

    def start_episode(self, state):
        self.history = Counter()
        self.model = Model(self.env)

    def act(self, state):
        self.history[state] += 1
        try:
            return next(self.plan)
        except StopIteration:
            self.plan = iter(self.make_plan(state))
            return next(self.plan)

    def eval_node(self, node):
        if not node.path:
            return np.inf  # the empty plan has infinite cost
        obs = self.env._observe(node.state)
        value = 0 if node.done else self.heuristic(self.env, obs)
        boredom = -0.1 * self.history[obs]  # penalize revisiting states
        score = node.reward + value + boredom
        return -score

    def make_plan(self, state, expansions=5000):
        Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
        eval_node = self.eval_node
        start = Node(self.env._state, [], 0, False)
        frontier = PriorityQueue(key=eval_node)
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        best_finished = start

        def expand(node):
            nonlocal best_finished
            s0, p0, r0, _ = node
            for a, s1, r, done in self.model.options(s0):
                node1 = Node(s1, p0 + [a], r0 + r, done)
                if node1.reward <= reward_to_state[s1]:
                    continue  # cannot be better than an existing node
                reward_to_state[s1] = node1.reward
                if done:
                    best_finished = min((best_finished, node1), key=eval_node)
                else:
                    frontier.push(node1)

        for i in range(expansions):
            self.save('frontier', [n[1].state for n in frontier])
            if frontier:
                expand(frontier.pop())
            else:
                break

        if best_finished.done:
            plan = best_finished
        elif frontier:
            plan = frontier.pop()  # fall back on the most promising partial plan
        else:
            raise RuntimeError('No plan found.')

        self.save('plan', plan)
        return plan.path
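# A sketch of a heuristic compatible with Astar, not part of the original
# code. Astar negates (reward + heuristic) to get a frontier priority, so
# the heuristic should optimistically estimate the reward still obtainable
# from an observation. The accessors `agent_pos` and `goal_pos` are
# hypothetical; adapt them to whatever the observation actually encodes.
def manhattan_heuristic(env, obs):
    (x, y) = env.agent_pos(obs)    # hypothetical accessor
    (gx, gy) = env.goal_pos(obs)   # hypothetical accessor
    # Closer to the goal means less remaining cost, hence a higher estimate.
    return -(abs(x - gx) + abs(y - gy))

# Usage: policy = Astar(manhattan_heuristic)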
class MetaBestFirstSearchEnv(gym.Env):
    """A meta-MDP for best-first search with a deterministic transition model."""
    Node = namedtuple('Node', ('state', 'path', 'reward', 'done'))
    State = namedtuple('State', ('frontier', 'reward_to_state', 'best_done'))
    TERM = 'TERM'

    def __init__(self, env, eval_node, expansion_cost=0.01):
        super().__init__()
        self.env = env
        self.expansion_cost = -abs(expansion_cost)
        # This guy interacts with the external environment, what a chump!
        self.surface_agent = Agent()
        self.surface_agent.register(self.env)
        self.eval_node = eval_node

    def _reset(self):
        self.env.reset()
        self.model = Model(self.env)  # warning: this breaks if env resets again
        start = self.Node(self.env._state, [], 0, False)
        # The frontier ordering is really part of the meta policy.
        frontier = PriorityQueue(key=self.eval_node(noisy=True))
        frontier.push(start)
        reward_to_state = defaultdict(lambda: -np.inf)
        best_done = None
        # Warning: state is mutable (and we mutate it!)
        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state

    def _step(self, action):
        """Expand a node in the frontier, or terminate and execute the plan."""
        if action is self.TERM:
            # The return of one episode in the external env is
            # one reward in the MetaSearchEnv.
            trace = self._execute_plan()
            external_reward = trace['return']
            return None, external_reward, True, {'trace': trace}
        else:
            return self._expand_node(action), self.expansion_cost, False, {}

    def _execute_plan(self):
        frontier, reward_to_state, best_done = self._state
        if not best_done:
            raise RuntimeError('Cannot make plan.')
        policy = FixedPlanPolicy(best_done.path)
        self.surface_agent.register(policy)
        trace = self.surface_agent.run_episode(reset=False)
        return trace

    def _expand_node(self, node):
        frontier, reward_to_state, best_done = self._state
        s0, p0, r0, _ = node
        for a, s1, r, done in self.model.options(s0):
            node1 = self.Node(s1, p0 + [a], r0 + r, done)
            if node1.reward <= reward_to_state[s1] - 0.002:
                continue  # cannot be better than an existing node
            reward_to_state[s1] = node1.reward
            if done:
                # The first finished node initializes best_done; later ones
                # must beat it under the noiseless node evaluation.
                if best_done is None:
                    best_done = node1
                else:
                    best_done = max((best_done, node1),
                                    key=self.eval_node(noisy=False))
            else:
                frontier.push(node1)
        self._state = self.State(frontier, reward_to_state, best_done)
        return self._state
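# A minimal sketch of a hand-coded meta policy driving MetaBestFirstSearchEnv,
# not part of the original code. It expands the most promising frontier node
# until a complete plan exists (or the budget runs out), then terminates.
# `make_env` and `eval_node` are assumed to match the interfaces above; in
# particular, eval_node is a factory taking a `noisy` keyword and returning
# a key function over Nodes.
def _demo_meta_search(make_env, eval_node, budget=100):
    meta = MetaBestFirstSearchEnv(make_env(), eval_node)
    state = meta._reset()
    for _ in range(budget):
        if state.best_done is not None or not state.frontier:
            break  # a complete plan exists (or there is nothing left to expand)
        state, reward, done, info = meta._step(state.frontier.pop())
    # Note: terminating raises if no finished plan was found within the budget.
    _, external_reward, _, info = meta._step(meta.TERM)
    return external_reward, info['trace']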