Example #1
    def _expand_graph_node(self, node):
        assert bool(node.rewards) == bool(node.edges)
        if node.edges:
            return  # graph node is expanded already

        # neighbours are ordered in the order of actions:
        # 0, 1, ..., _model.num_actions - 1
        observations, rewards, dones, infos, states = \
            yield from self._model.predict_steps(
                node.state,
                list(space_utils.element_iter(self._model.action_space))
            )
        solved = [info.get('solved', False) for info in infos]
        node.bonus = [info.get('bonus', 0.) for info in infos]
        self._bonuses.append(max(node.bonus))

        value_batch = yield Request(RequestType.AGENT_PREDICTION,
                                    np.array(observations))

        for idx, action in enumerate(
                space_utils.element_iter(self._action_space)):
            node.rewards[action] = rewards[idx]
            new_node = self._state2node.get(states[idx], None)
            if new_node is None:
                if dones[idx]:
                    child_value = self._value_traits.zero
                else:
                    [child_value] = value_batch[idx]
                new_node = self._initialize_graph_node(child_value,
                                                       states[idx],
                                                       dones[idx],
                                                       solved=solved[idx])
            node.edges[action] = new_node
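Note that _expand_graph_node is a coroutine: it suspends at every yield / yield from (the model's predict_steps call and the Request(RequestType.AGENT_PREDICTION, ...) request) and expects the caller to send the corresponding prediction back before it resumes. A minimal sketch of such a driver loop, where handle is a hypothetical callable that evaluates one yielded request (e.g. by running the agent network on the batched observations):

def run_coroutine(coroutine, handle):
    """Drive a request-yielding coroutine to completion.

    handle is a hypothetical callable that evaluates a yielded request
    and returns the prediction to send back into the coroutine.
    """
    try:
        request = next(coroutine)               # run until the first yield
        while True:
            response = handle(request)          # e.g. evaluate the network
            request = coroutine.send(response)  # resume with the prediction
    except StopIteration as stop:
        return stop.value                       # value returned by the coroutine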
Example #2
    def _expand_leaf(self, leaf):
        if leaf is None:  # Dead End
            return self._value_traits.dead_end

        if leaf.terminal:  # Terminal state
            return self._value_traits.zero

        # neighbours are ordered in the order of actions:
        # 0, 1, ..., _model.num_actions - 1
        obs, rewards, dones, solved, states = self._children_of_state(
            leaf.state)

        value_batch = yield np.array(obs)

        for idx, action in enumerate(
                space_utils.element_iter(self._action_space)):
            leaf.rewards[action] = rewards[idx]
            new_node = self._state2node.get(states[idx], None)
            if new_node is None:
                if dones[idx]:
                    child_value = self._value_traits.zero
                else:
                    child_value = value_batch[idx]
                new_node = self._initialize_graph_node(child_value,
                                                       states[idx],
                                                       dones[idx],
                                                       solved=solved[idx])
            leaf.children[action] = TreeNode(new_node)

        return leaf.value_acc.get()
Example #3
    def _expand_graph_node(self, node):
        assert bool(node.rewards) == bool(node.edges)
        if (len(node.edges) > 0 or  # graph node is expanded already
                node.solved or node.terminal):
            return

        # neighbours are ordered in the order of actions:
        # 0, 1, ..., _model.num_actions - 1
        observations, rewards, dones, infos, states = \
            yield from self._model.predict_steps(
                node.state,
                list(space_utils.element_iter(self._model.action_space))
            )
        # solved = [info.get('solved', False) for info in infos]
        assert all(reward in (0, 1) for reward in rewards), \
            'We assume the env is deterministic and has goal states: ' \
            'reaching a goal state gives reward=1 and ends the episode; ' \
            'all other actions give reward=0.'
        solved = [reward == 1 for reward in rewards]

        node.bonus = [
            self._filter_bonus(info.get('bonus', 0.), reward, done)
            for info, reward, done in zip(infos, rewards, dones)
        ]
        node.value_acc.add_bonus(max(node.bonus))
        self._bonuses.append(max(node.bonus))

        value_batch = yield Request(RequestType.AGENT_PREDICTION,
                                    np.array(observations))

        for idx, action in enumerate(
                space_utils.element_iter(self._action_space)):
            node.rewards[action] = rewards[idx]
            new_node = self._state2node.get(states[idx], None)
            if new_node is None:
                if dones[idx]:
                    child_value = self._value_traits.zero
                else:
                    [child_value] = value_batch[idx]
                new_node = self._initialize_graph_node(child_value,
                                                       states[idx],
                                                       dones[idx],
                                                       solved=solved[idx])
            node.edges[action] = new_node
        self._update_from_node(node)
Example #4
    def qualities(self, observation, model):
        actions = list(space_utils.element_iter(model.action_space))
        (observations, rewards,
         dones) = yield from model.predict_steps(actions, include_state=False)

        if not self._use_policy:
            values = yield observations
        else:
            (values, _) = yield observations

        values = np.reshape(values, -1)

        return list(rewards + self._discount * values * (1 - dones))
Example #5
    def _children_of_state(self, parent_state):
        old_state = self._model.clone_state()

        self._model.restore_state(parent_state)

        def step_and_rewind(action):
            (observation, reward, done, info) = self._model.step(action)
            state = self._model.clone_state()
            solved = 'solved' in info and info['solved']
            self._model.restore_state(parent_state)
            return (observation, reward, done, solved, state)

        results = zip(*[
            step_and_rewind(action)
            for action in space_utils.element_iter(self._model.action_space)
        ])
        self._model.restore_state(old_state)
        return results
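The clone_state / restore_state calls above let _children_of_state try every action from the same parent state without permanently advancing the environment. A self-contained sketch of the same step-and-rewind idea, using a hypothetical toy model (not part of the library):

class ToyModel:
    """Toy deterministic model exposing clone/restore semantics."""

    def __init__(self):
        self._position = 0

    def clone_state(self):
        return self._position

    def restore_state(self, state):
        self._position = state

    def step(self, action):
        self._position += action
        observation = self._position
        reward = float(self._position == 3)
        done = self._position == 3
        return observation, reward, done


def children_of_state(model, parent_state, actions):
    old_state = model.clone_state()
    model.restore_state(parent_state)
    children = []
    for action in actions:
        children.append(model.step(action))
        model.restore_state(parent_state)  # rewind before trying the next action
    model.restore_state(old_state)         # leave the model as we found it
    return tuple(zip(*children))           # transpose to (observations, rewards, dones)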
Example #6
    def predict_step(self, observation, action, repeat_fix=True):
        """Predicts next state, reward and done.

        Args:
            observation (np.ndarray): Array of shape (height, width, n_channels)
                of one-hot encoded observation (along axis=-1).
            action (int): Action performed by the agent.
            repeat_fix (bool): Whether to apply a workaround for a
                RequestHandler limitation: tensor shapes must be identical
                across requests, so we always pass a tensor of stacked
                prediction requests, one per action in the state.

        Yields:
            request (Request): Model prediction request with one-hot encoded
                input state and action; handled by RequestHandler.

        Returns:
            pred_obs (np.ndarray): Array of shape (height, width,
                n_channels) of one-hot encoded state (along axis=-1).
            reward (float): Reward received by the agent.
            done (bool): Indicates if episode terminated.
            info (dict): Additional environment info (not returned when
                repeat_fix is True).
        """
        if repeat_fix:
            actions = list(space_utils.element_iter(self.action_space))
            next_obs, rewards, dones, _, _ = yield from self.predict_steps(
                self.obs2state(observation), actions)
            return next_obs[action], rewards[action], dones[action]
        batched_observation = np.expand_dims(observation, axis=0)
        batched_action = np.expand_dims(action, axis=0)

        pred_obs, reward, done, info = yield from self._batch_predict_steps(
            batched_observation, batched_action)

        # Unbatch predictions
        pred_obs = tf.squeeze(pred_obs, axis=0).numpy()
        reward = reward.item()
        done = done.item()
        info = info.item()

        return pred_obs, reward, done, info
Example #7
    def qualities(self, observation, model):
        init_state = model.clone_state()

        child_qualities = []
        for init_action in space_utils.element_iter(model.action_space):
            (observation, init_reward,
             done) = yield from model.step(init_action)
            yield from self._rollout_agent.reset(model, observation)
            value = 0
            total_discount = 1
            time = 0
            while not done and time < self._time_limit:
                (action, _) = yield from self._rollout_agent.act(observation)
                (observation, reward, done) = yield from model.step(action)
                value += total_discount * reward
                total_discount *= self._discount
                time += 1
            child_qualities.append(init_reward + self._discount * value)
            model.restore_state(init_state)
        return child_qualities
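The rollout loop accumulates a truncated discounted return, value = sum_t discount**t * reward_t for t < _time_limit. The same accumulation as a standalone helper (a sketch, assuming the rollout rewards are given as a plain list):

def discounted_return(rewards, discount):
    """Sum of discount**t * reward_t, matching the accumulation in the loop above."""
    value = 0.0
    total_discount = 1.0
    for reward in rewards:
        value += total_discount * reward
        total_discount *= discount
    return value

# discounted_return([0.0, 0.0, 1.0], 0.99) == 0.99 ** 2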
Example #8
    def __call__(self, observation, model):
        init_state = model.clone_state()

        child_qualities = []
        for init_action in space_utils.element_iter(model.action_space):
            (observation, init_reward, done, _) = model.step(init_action)
            yield from self._agent.reset(model, observation)
            value = 0
            total_discount = 1
            time = 0
            while not done and time < self._time_limit:
                (action, _) = yield from self._agent.act(observation)
                (observation, reward, done, _) = model.step(action)
                value += total_discount * reward
                total_discount *= self._discount
                time += 1
            child_qualities.append(init_reward + self._discount * value)
            model.restore_state(init_state)
        prior = _uniform_prior(len(child_qualities))
        return list(zip(child_qualities, prior))
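_uniform_prior is not shown on this page; assuming it simply spreads probability mass evenly over the child actions, a sketch could look like:

import numpy as np

def _uniform_prior(n):
    # Hypothetical sketch: equal prior probability for each of n child actions.
    return np.full(n, 1.0 / n)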
Example #9
    def _expand_leaf(self, leaf):
        # print("MCTS leaf")
        if leaf is None:  # Dead End
            return self._value_traits.dead_end

        if leaf.terminal:  # Terminal state
            return self._value_traits.zero

        # neighbours are ordered in the order of actions:
        # 0, 1, ..., _model.num_actions - 1
        obs, rewards, dones, solved, states = self._children_of_state(
            leaf.state)

        value_batch = yield np.array(obs).astype(np.float32)

        for idx, action in enumerate(
                space_utils.element_iter(self._action_space)):
            leaf.rewards[action] = rewards[idx]
            new_node = self._state2node.get(states[idx], None)
            if new_node is None:
                if dones[idx]:
                    child_value = self._value_traits.zero
                else:
                    child_value = value_batch[idx]
                new_node = self._initialize_graph_node(child_value,
                                                       states[idx],
                                                       dones[idx],
                                                       solved=solved[idx])
            leaf.children[action] = TreeNode(new_node)

        return leaf.value_acc.get()
Example #10
    def __call__(self, observation, model):
        del observation

        init_state = model.clone_state()

        def step_and_rewind(action):
            (observation, reward, done, _) = model.step(action)
            model.restore_state(init_state)
            return (observation, reward, done)

        (observations, rewards, dones) = data.nested_stack([
            step_and_rewind(action)
            for action in space_utils.element_iter(model.action_space)
        ])
        # Run the network to predict values for children.
        values = yield observations
        # (batch_size, 1) -> (batch_size,)
        values = np.reshape(values, -1)
        # Compute the final qualities, masking out the "done" states.
        child_qualities = list(rewards + self._discount * values * (1 - dones))
        prior = _uniform_prior(len(child_qualities))
        return list(zip(child_qualities, prior))
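The quality formula bootstraps with the predicted child values and masks them out for terminal transitions: quality = reward + discount * value * (1 - done). A small numeric sketch of the same broadcasting:

import numpy as np

rewards = np.array([0.0, 1.0, 0.0])
values = np.array([0.5, 0.7, 0.2])   # network value estimates for the children
dones = np.array([0.0, 1.0, 0.0])    # the second child is terminal
discount = 0.99

qualities = rewards + discount * values * (1 - dones)
# -> [0.495, 1.0, 0.198]; the terminal child's value is not bootstrapped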
Example #11
    def _children_of_state(self, parent_state):
        # print("MCTS ch")
        old_state = self._model.clone_state()

        self._model.restore_state(parent_state)

        def step_and_rewind(action):
            (full_observation, reward, done, info) = self._model.step(action)
            observation = np.concatenate(
                [full_observation['observation'],
                 full_observation['desired_goal']],
                axis=-1)
            state = self._model.clone_state()
            solved = 'solved' in info and info['solved']
            self._model.restore_state(parent_state)
            return (observation, reward, done, solved, state)

        results = zip(*[
            step_and_rewind(action)
            for action in space_utils.element_iter(self._model.action_space)
        ])
        self._model.restore_state(old_state)
        return results