class ChildrenValuePrinter(HumanPrintWrapper):
    """Render wrapper that reports the value of the current state and of
    every one-step child state.

    A separate ``SokobanEnv`` instance (``render_env``) is used to roll out
    child states so the wrapped environment's state is never disturbed.
    """

    def __init__(self, env, value_fun):
        """
        Args:
            env: wrapped environment; must expose ``init_kwargs`` and
                ``clone_full_state()``.
            value_fun: callable taking the keyword argument ``states`` and
                returning a batch of values (indexed ``[0][0]`` for a single
                state).
        """
        super().__init__(env)
        self.render_env = SokobanEnv(**env.init_kwargs)
        self.value_fun = value_fun

    def formatted_state_value(self, state):
        """Return the value of ``state`` formatted with two decimal places."""
        return "{:.2f}".format(self.value_fun(states=state)[0][0])

    def build_texts(self, obs, reward, done, info):
        """Build display texts: current state value and all children values.

        Returns:
            list of two strings: ``'Value: …'`` and ``'Children values: …'``.
        """
        state = self.env.clone_full_state()
        value_str = self.formatted_state_value(state)

        child_values = []
        for action in range(self.render_env.action_space.n):
            # Restore the root state before each action so children are
            # evaluated independently.
            self.render_env.restore_full_state(state)
            self.render_env.step(action)
            child_state = self.render_env.clone_full_state()
            child_values.append(self.formatted_state_value(child_state))

        # Format once; previously this string was built twice (for the print
        # and for the return value), which duplicated work.
        children_str = 'Children values: {}'.format(" ".join(child_values))
        print(children_str)  # console echo kept for interactive debugging
        return [
            'Value: {}'.format(value_str),
            children_str,
        ]
class PolicyFromFullTree(Policy):
    """Policy that exhaustively enumerates every action sequence up to
    ``depth`` steps, scores each reached state with ``value_fn`` plus the
    accumulated branch reward, and returns the first action of the best
    branch.
    """

    def __init__(self, value_fn, env_kwargs, depth=4):
        """
        Args:
            value_fn: callable taking the keyword argument ``states`` and
                returning the state value.
            env_kwargs: kwargs used to construct ``SokobanEnv`` instances.
            depth: maximum lookahead depth (number of actions per sequence).
        """
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        self.value_function = value_fn
        # Simulation environment used for rollouts inside best_actions().
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.depth = depth
        # state tuple -> (value, branch_reward, depth, root_action, actions)
        self.nodes = dict()

    def best_actions(self, state):
        """Return a one-element list with the best root action from ``state``.

        Bug fix: the node cache is reset on every call. Previously
        ``self.nodes`` accumulated entries across calls, so stale nodes
        cached for an earlier root state could win the ``max()`` below and
        yield an action that is meaningless for the current state.
        """
        self.nodes = dict()

        # Enumerate all action sequences of length self.depth.
        seq_ = [range(self.env.action_space.n)] * self.depth
        action_seq = list(product(*seq_))

        for actions in action_seq:
            root_action = actions[0]
            self.env.restore_full_state(state)
            branch_reward = 0
            current_depth = 0
            for action in actions:
                current_depth += 1
                ob, reward, done, _ = self.env.step(action)
                branch_reward += reward
                node = tuple(self.env.clone_full_state())
                if node not in self.nodes:
                    value = self.value_function(states=np.array(node))
                    if done:
                        # Large bonus so solved states dominate the ranking.
                        value += 1000
                    self.nodes[node] = (value, branch_reward, current_depth,
                                        root_action,
                                        actions[:current_depth])
                else:
                    value, previous_reward, previous_depth, _, _ = \
                        self.nodes[node]
                    # Prefer the shallower path to an already-seen state.
                    if previous_depth > current_depth:
                        self.nodes[node] = (value, branch_reward,
                                            current_depth, root_action,
                                            actions[:current_depth])
                if done:
                    break

        # Rank nodes by estimated value plus reward collected on the branch.
        best_node = max(
            self.nodes.keys(),
            key=(lambda node: self.nodes[node][0] + self.nodes[node][1]))
        node_value, branch_reward, current_depth, root_action, actions = \
            self.nodes[best_node]
        return [root_action]
class QFromV(object):
    """Derive Q-values from a state-value function by one-step lookahead:
    ``Q(s, a) = r(s, a) + V(s')`` (the ``V`` term is skipped for terminal
    transitions).
    """

    def __init__(self, value_function, env_kwargs, nan_for_zero_value=True,
                 copy_negative=True):
        """
        Args:
            value_function: callable taking the keyword argument ``states``.
            env_kwargs: kwargs used to construct the ``SokobanEnv`` used for
                lookahead rollouts.
            nan_for_zero_value: if True, return all-NaN Q-values when
                ``V(state) == 0`` (e.g. Sokoban success states without
                children).
            copy_negative: if True, short-circuit and replicate a negative
                ``V(state)`` across all actions as a speed-up.
        """
        self.value_function = value_function
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.nan_for_zero_value = nan_for_zero_value
        self.copy_negative_values = copy_negative

    @property
    def env_n_actions(self):
        """Number of discrete actions in the lookahead environment."""
        return self.env.action_space.n

    def q_values(self, state):
        """Return a list of Q-values for ``state``, one per action."""
        # Hoist the value call: previously value_function(states=state) was
        # invoked separately for the zero check and the negative check.
        if self.nan_for_zero_value or self.copy_negative_values:
            root_value = self.value_function(states=state)

            if self.nan_for_zero_value:
                # Value might not have children for Sokoban success states.
                if root_value == 0:
                    return [np.nan] * self.env_n_actions

            if self.copy_negative_values:
                # For speed-up
                val = root_value[0]
                if val < 0:
                    return [val] * self.env_n_actions

        q_values = list()
        for action in range(self.env_n_actions):
            self.env.restore_full_state(state)
            ob, reward, done, _ = self.env.step(action)
            value = reward
            child_state = self.env.clone_full_state()
            if not done:
                value += self.value_function(states=child_state)[0]
            q_values.append(float(value))
        return q_values