Example #1
    def _expand_leaf(self, leaf, observation):
        """Expands a leaf and returns the quality of one of its new children.

        Rates all candidate children using the leaf rater, attaches them to
        the leaf, then chooses an exploratory action and reports that child's
        quality so it can be backpropagated.

        Only modifies leaf - adds children with new qualities.

        Args:
            leaf (TreeNode): Leaf to expand.
            observation (np.ndarray): Observation received at leaf.

        Yields:
            Network prediction requests.

        Returns:
            float: Quality of a chosen child of the expanded leaf.
        """
        rated_children = yield from self._new_leaf_rater(
            observation, self._model
        )
        # TODO(koz4k): This doesn't work with dynamic action spaces. Fix.
        n_actions = space_utils.max_size(self._action_space)
        assert len(rated_children) == n_actions
        leaf.children = []
        for (quality, prior) in rated_children:
            # Rescale the raw quality before storing it on the child node.
            adjusted_quality = (
                quality * self._leaf_quality_dampening +
                self._leaf_quality_bias
            )
            leaf.children.append(TreeNode(adjusted_quality, prior))
        chosen_action = self._choose_action(leaf, exploratory=True)
        return leaf.children[chosen_action].quality
 def _on_new_root(self, root):
     """Mixes Dirichlet noise into the root children's prior probabilities."""
     n_actions = space_utils.max_size(self._action_space)
     noise = np.random.dirichlet([self._prior_noise_parameter] * n_actions)
     priors = np.array(
         [child.prior_probability for child in root.children]
     )
     weight = self._prior_noise_weight
     # Convex combination of the network priors and the sampled noise.
     mixed_priors = (1 - weight) * priors + weight * noise
     for (child, new_prior) in zip(root.children, mixed_priors):
         child.prior_probability = new_prior
Example #3
    def network_signature(self, observation_space, action_space):
        """Builds the network signature for the given spaces.

        The output is one action-sized tensor, or two of them when
        self._use_policy is set.
        """
        action_vector_sig = data.TensorSignature(
            shape=(space_utils.max_size(action_space),)
        )
        if self._use_policy:
            output_sig = (action_vector_sig, action_vector_sig)
        else:
            output_sig = action_vector_sig
        return data.NetworkSignature(
            input=space_utils.signature(observation_space),
            output=output_sig,
        )
Example #4
    def network_signature(self, observation_space, action_space):
        """Builds the network signature for the given spaces.

        When self._inject_log_temperature is set, the input is a pair
        (observation, scalar log-temperature); otherwise just the observation.
        """
        obs_sig = space_utils.signature(observation_space)
        if self._inject_log_temperature:
            input_sig = (obs_sig, data.TensorSignature(shape=(1,)))
        else:
            input_sig = obs_sig

        # One output entry per action.
        output_sig = data.TensorSignature(
            shape=(space_utils.max_size(action_space),)
        )
        return data.NetworkSignature(input=input_sig, output=output_sig)
Example #5
    def network_signature(self, observation_space, action_space):
        """Builds the network signature for the given spaces.

        The output is a scalar tensor, accompanied by an action-sized tensor
        when self._use_policy is set.
        """
        n_actions = space_utils.max_size(action_space)
        input_sig = space_utils.signature(observation_space)
        scalar_sig = data.TensorSignature(shape=(1,))
        if not self._use_policy:
            return data.NetworkSignature(input=input_sig, output=scalar_sig)
        return data.NetworkSignature(
            input=input_sig,
            output=(scalar_sig, data.TensorSignature(shape=(n_actions,))),
        )
Example #6
    def _handle_env_feedback(self, agent_info, action, next_observation,
                             reward, done, env_info):
        """Handles model's mispredictions.

        Compares the model's prediction of the last step against the real
        environment's feedback and patches the search graph in place: the
        edge reward is overwritten with the true reward, and a mispredicted
        next state is replaced by a node for the true state (created on the
        fly if that state was never visited before).

        Args:
            agent_info (dict): Agent info from the last act(); the 'node' key
                holds the graph node the last action was taken from.
            action: Action taken in the real environment.
            next_observation (np.ndarray): True next observation.
            reward (float): True reward received from the environment.
            done (bool): Whether the episode has terminated.
            env_info (dict): Environment info; the 'solved' key is read if
                present.

        Yields:
            Request: An agent prediction request, when a value estimate for
            a previously unseen true state is needed.
        """

        if not self._use_trainable_env:
            # We use perfect model, so there aren't any mispredictions
            # to handle.
            return
        root_parent = agent_info['node']
        true_state = self._model.obs2state(next_observation)
        solved = env_info.get('solved', False)

        # Correct mispredicted reward.
        root_parent.rewards[action] = reward

        if self._current_node.state != true_state:
            # self._model predicted wrong state, initialize new tree from
            # the true state
            new_node = self._state2node.get(true_state, None)
            if new_node is None:
                # True next state was not visited previously.
                # Initialize new GraphNode.
                if done:
                    value = self._value_traits.zero
                else:
                    # Batch stepper requires all requests submitted at the same
                    # time to have equal shape. The only other place, which
                    # sends requests, is self._expand_leaf() method, where
                    # `n_actions` observations are sent - so we do the same
                    # here.
                    #
                    # In practice: in batch stepper allow different number of
                    # observations to be sent from different agents.
                    n_actions = space_utils.max_size(self._model.action_space)
                    response = yield Request(
                        RequestType.AGENT_PREDICTION,
                        np.array([next_observation] * n_actions))
                    [value] = response[0]  # we ignore all other responses

                new_node = self._initialize_graph_node(value, true_state, done,
                                                       solved)
            # Correct mispredicted state in GraphNode, so we won't make
            # the same mistake again.
            root_parent.edges[action] = new_node
            self._current_node = new_node

        # Sync terminal/solved flags with the ground truth from the real env.
        self._current_node.terminal = done

        self._current_node.solved = solved
Example #7
 def network_signature(self, observation_space, action_space):
     """Builds per-request-type network signatures.

     Returns a dict mapping the agent request type to a value-prediction
     signature and the model request type to a next-step-prediction
     signature.
     """
     obs_sig = space.signature(observation_space)
     scalar_sig = data.TensorSignature(shape=(1,))
     action_sig = data.TensorSignature(
         shape=(space.max_size(action_space),)
     )
     agent_sig = data.NetworkSignature(input=obs_sig, output=scalar_sig)
     model_sig = data.NetworkSignature(
         input={'observation': obs_sig, 'action': action_sig},
         output={
             'next_observation': obs_sig,
             'reward': scalar_sig,
             'done': scalar_sig,
         },
     )
     return {
         data.AgentRequest: agent_sig,
         data.ModelRequest: model_sig,
     }
Example #8
    def _expand_leaf(self, leaf, observation):
        """Expands a leaf: initializes its children and picks one to backprop.

        Args:
            leaf (TreeNode): Leaf to expand.
            observation (np.ndarray): Observation received at the leaf.

        Yields:
            Network prediction requests (from self._init_child_nodes).

        Returns:
            float: Quality of the chosen child of the expanded leaf.
        """
        leaf.children = yield from self._init_child_nodes(leaf, observation)
        for node in leaf.children:
            quality = node.quality(self._discount)
            prob = node.prior_probability
            prob_ok = prob is None or np.isscalar(prob)
            # Use np.shape() instead of .shape in the message: Python scalars
            # and lists have no .shape attribute, so accessing it here would
            # raise AttributeError and mask the assertion message (e.g. when
            # quality is a valid scalar but prob is invalid).
            assert np.isscalar(quality) and prob_ok, (
                'Invalid shape of node quality or prior probability - expected '
                'scalars, got {} and {}. Check if your network architecture is '
                'appropriate for the observation shape.'.format(
                    np.shape(quality),
                    np.shape(prob) if prob is not None else None))

        assert len(leaf.children) == space_utils.max_size(self._action_space)

        if leaf is self._root:
            self._on_new_root(leaf)

        (child, _) = self._choose_child(leaf,
                                        exploratory=True,
                                        strict_filter=True)
        return child.quality(self._discount)
Example #9
    def act(self, observation):
        """Queries the agent and model networks and derives an action.

        Sends one agent request (a value prediction for the observation) and
        one model request (a next-step prediction for a randomly chosen
        action), in random order when self._random_order is set, then
        deterministically maps the combined responses to an action.

        Yields:
            Network requests.

        Returns:
            Pair (action, agent_info dict with the predicted 'value').
        """
        agent_request = data.AgentRequest(observation[np.newaxis, :])

        n_actions = space.max_size(self._action_space)
        queried_action = random.randrange(0, n_actions)
        model_request = data.ModelRequest({
            'observation': observation[np.newaxis, :],
            'action': transformations.one_hot_encode(
                [queried_action], n_actions
            )
        })

        agent_first = not self._random_order or random.randrange(0, 2) == 0
        if agent_first:
            agent_response = yield agent_request
            model_response = yield model_request
        else:
            model_response = yield model_request
            agent_response = yield agent_request

        # Sanity-check the shapes of both responses (batch size 1).
        assert agent_response.shape == (1, 1)
        expected_shapes = {
            'next_observation': (1,) + observation.shape,
            'reward': (1, 1),
            'done': (1, 1),
        }
        actual_shapes = data.ops.nested_map(
            lambda arr: arr.shape, model_response)
        assert actual_shapes == expected_shapes

        value = agent_response.item()
        # Fold every part of the responses into one number, so the chosen
        # action depends on all of them.
        meaningless_sum = (
                value
                + np.sum(model_response['next_observation'])
                + model_response['reward'].item()
                + model_response['done'].item()
        )
        return int(meaningless_sum * 1e9) % n_actions, {'value': value}
Example #10
    def solve(self, env, epoch=None, init_state=None, time_limit=None):
        """Runs one episode in env and collects it into a data.Episode.

        Args:
            env: Environment to run in.
            epoch: Current training epoch, stored on the agent.
            init_state: If given, the environment is restored to this state
                instead of being reset.
            time_limit: If given, the environment is wrapped in a time limit.

        Yields:
            Network requests (from reset() and act()).

        Returns:
            data.Episode: The collected episode.
        """
        yield from super().solve(env, epoch, init_state, time_limit)

        self._epoch = epoch

        # Keep a handle to the unwrapped env; the model and the action space
        # are taken from it, not from the time-limit wrapper.
        model_env = env
        if time_limit is not None:
            env = envs.TimeLimitWrapper(env, time_limit)

        if init_state is None:
            observation = env.reset()
        else:
            observation = env.restore_state(init_state)

        yield from self.reset(model_env, observation)

        for callback in self._callbacks:
            callback.on_episode_begin(env, observation, epoch)

        transitions = []
        done = False
        info = {}
        while not done:
            (action, agent_info) = yield from self.act(observation)
            (next_observation, reward, done, info) = env.step(action)

            for callback in self._callbacks:
                callback.on_real_step(agent_info, action, next_observation,
                                      reward, done)

            transitions.append(data.Transition(
                observation=observation,
                action=action,
                reward=reward,
                done=done,
                next_observation=next_observation,
                agent_info=agent_info,
            ))
            observation = next_observation

        for callback in self._callbacks:
            callback.on_episode_end()

        transitions = self.postprocess_transitions(transitions)

        # Optional flags from the final info dict.
        solved = None
        if 'solved' in info:
            solved = info['solved']
        truncated = None
        if 'TimeLimit.truncated' in info:
            truncated = info['TimeLimit.truncated']

        return data.Episode(
            transition_batch=data.nested_stack(transitions),
            return_=sum(transition.reward for transition in transitions),
            solved=solved,
            truncated=truncated,
            action_space_size=space.max_size(model_env.action_space),
        )
 def params_signature(action_space):
     """Returns the signature of a parameter vector sized by the action space."""
     n_actions = space_utils.max_size(action_space)
     return data.TensorSignature(shape=(n_actions,))