Example #1
import numpy as np
from itertools import product

# SokobanEnv, HumanPrintWrapper and Policy are assumed to be provided by the
# surrounding project.


class ChildrenValuePrinter(HumanPrintWrapper):
    def __init__(self, env, value_fun):
        """Initialize the printer.

        Args:
            env: environment to wrap.
            value_fun: callable (obs, states) -> value; it is called with the
                keyword argument `states`.
        """
        super().__init__(env)
        self.render_env = SokobanEnv(**env.init_kwargs)
        self.value_fun = value_fun

    def formatted_state_value(self, state):
        """Return the value of `state` formatted to two decimal places."""
        return "{:.2f}".format(self.value_fun(states=state)[0][0])

    def build_texts(self, obs, reward, done, info):
        child_values = list()
        state = self.env.clone_full_state()
        value_str = self.formatted_state_value(state)
        # Evaluate the state reached by each available action.
        for action in range(self.render_env.action_space.n):
            self.render_env.restore_full_state(state)
            self.render_env.step(action)
            child_state = self.render_env.clone_full_state()
            child_value_str = self.formatted_state_value(child_state)
            child_values.append(child_value_str)
        print('Children values: {}'.format(" ".join(child_values)))
        return [
            'Value: {}'.format(value_str),
            'Children values: {}'.format(" ".join(child_values))
        ]
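

# Usage sketch (not part of the original code): assuming HumanPrintWrapper is a
# standard Gym-style wrapper, this shows how the printer is meant to be used.
# `env_kwargs` and `value_fun` are hypothetical stand-ins for the project's
# environment configuration and learned value function; `value_fun` must accept
# the keyword argument `states`, as in formatted_state_value above.
def _example_children_value_printer(env_kwargs, value_fun):
    env = ChildrenValuePrinter(SokobanEnv(**env_kwargs), value_fun)
    env.reset()
    # Each step now also prints the values of the current state's children.
    return env.step(0)
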
class PolicyFromFullTree(Policy):
    def __init__(self, value_fn, env_kwargs, depth=4):
        self.render_env = SokobanEnv(**env_kwargs)
        self.env_n_actions = self.render_env.action_space.n
        self.value_function = value_fn
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.depth = depth
        # Cache: state tuple -> (value, branch reward, depth, root action,
        # action prefix); persists across calls to best_actions().
        self.nodes = dict()

    def best_actions(self, state):
        # Enumerate all action sequences of length `depth`.
        seq_ = [range(self.env.action_space.n)] * self.depth
        action_seq = list(product(*seq_))
        # print("len(action_seq) {}".format(len(action_seq)))
        for actions in action_seq:
            root_action = actions[0]
            self.env.restore_full_state(state)
            branch_reward = 0
            current_depth = 0
            for action in actions:
                current_depth += 1
                ob, reward, done, _ = self.env.step(action)
                branch_reward += reward
                node = tuple(self.env.clone_full_state())
                if node not in self.nodes:
                    value = self.value_function(
                        states=np.array(node)
                    )  # self.model.predict(np.expand_dims(ob, axis=0))[0]
                    if done:
                        # Large bonus for reaching a terminal (solved) state.
                        value += 1000
                    self.nodes[node] = (value, branch_reward, current_depth,
                                        root_action, actions[:current_depth])
                else:
                    value, previous_reward, previous_depth, _, _ = self.nodes[
                        node]
                    if previous_depth > current_depth:
                        # Keep the shallowest path found to this node; this
                        # also refreshes the root action for the current search.
                        # if previous_reward > branch_reward:
                        #   assert branch_reward > 10., "{} {}".format(previous_reward, branch_reward)
                        self.nodes[node] = (value, branch_reward,
                                            current_depth, root_action,
                                            actions[:current_depth])
                if done:
                    break
        # Pick the node maximizing value estimate plus accumulated reward.
        best_node = max(
            self.nodes.keys(),
            key=(lambda node: self.nodes[node][0] + self.nodes[node][1]))
        node_value, branch_reward, current_depth, root_action, actions = self.nodes[
            best_node]
        # print("Distinct leaves {}".format(len(self.nodes)))
        # print("Node value {}, reward {:.1f}, depth {}, action {}, actions {}".format(
        #     node_value, branch_reward, current_depth, root_action, actions))
        return [root_action]
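

# Usage sketch (not part of the original code): run the exhaustive-tree policy
# for one episode. `env_kwargs`, `value_fn` and `max_steps` are hypothetical
# stand-ins; the environment API (clone_full_state, 4-tuple step) matches the
# one assumed by the classes above.
def _example_run_full_tree_policy(env_kwargs, value_fn, max_steps=100):
    env = SokobanEnv(**env_kwargs)
    policy = PolicyFromFullTree(value_fn, env_kwargs, depth=4)
    env.reset()
    total_reward = 0.0
    for _ in range(max_steps):
        action = policy.best_actions(env.clone_full_state())[0]
        _, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            break
    return total_reward
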
class QFromV(object):
    def __init__(self,
                 value_function,
                 env_kwargs,
                 nan_for_zero_value=True,
                 copy_negative=True):
        self.value_function = value_function
        self.env = SokobanEnv(**env_kwargs)
        self.env.reset()
        self.nan_for_zero_value = nan_for_zero_value
        self.copy_negative_values = copy_negative

    @property
    def env_n_actions(self):
        return self.env.action_space.n

    def q_values(self, state):
        q_values = list()
        if self.nan_for_zero_value:
            # Sokoban success states (value 0) have no children to evaluate,
            # so their Q-values are left undefined.
            if self.value_function(states=state) == 0:
                return [np.nan] * self.env_n_actions
        if self.copy_negative_values:
            # Speed-up: for states with a negative value, return that value for
            # every action instead of expanding the children.
            val = self.value_function(states=state)[0]
            if val < 0:
                return [val] * self.env_n_actions

        # One-step lookahead: Q(s, a) = r(s, a) + V(s') for non-terminal s'.
        for action in range(self.env_n_actions):
            self.env.restore_full_state(state)
            ob, reward, done, _ = self.env.step(action)
            value = reward
            child_state = self.env.clone_full_state()
            if not done:
                value += self.value_function(states=child_state)[0]
            q_values.append(float(value))
        return q_values
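

# Usage sketch (not part of the original code): turn the one-step Q-values into
# a greedy action. `q_from_v` is a QFromV instance; np.nanargmax skips the NaN
# entries that q_values() may return, and the all-NaN case (a success state) is
# handled separately.
def _example_greedy_action(q_from_v, state):
    q_values = q_from_v.q_values(state)
    if all(np.isnan(q) for q in q_values):
        return 0  # Success state: every action is equivalent here.
    return int(np.nanargmax(q_values))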