Example #1
    def __init__(self,
                 preprocessor_spec,
                 policy_spec,
                 exploration_spec=None,
                 **kwargs):
        """
        Args:
            preprocessor_spec (Union[list,dict,PreprocessorSpec]):
                - A dict if the state from the Env will come in as a ContainerSpace (e.g. Dict). In this case,
                    each key in this dict specifies which value in the incoming dict should go through which PreprocessorStack.
                - A list with layer specs.
                - A PreprocessorStack object.

            policy_spec (Union[dict,Policy]): A specification dict for a Policy object or a Policy object directly.

            exploration_spec (Union[dict,Exploration]): A specification dict for an Exploration object or an Exploration
                object directly.
        """
        super(ActorComponent,
              self).__init__(scope=kwargs.pop("scope", "actor-component"),
                             **kwargs)

        self.preprocessor = PreprocessorStack.from_spec(preprocessor_spec)
        self.policy = Policy.from_spec(policy_spec)
        self.num_nn_inputs = self.policy.neural_network.num_inputs
        self.exploration = Exploration.from_spec(exploration_spec)

        self.tuple_merger = ContainerMerger(is_tuple=True,
                                            merge_tuples_into_one=True)
        self.tuple_splitter = ContainerSplitter(
            tuple_length=self.num_nn_inputs)

        self.add_components(self.policy, self.exploration, self.preprocessor,
                            self.tuple_merger, self.tuple_splitter)
Example #2
    def test_actor_component_with_dict_preprocessor(self):
        # state_space (a complex Dict Space, that will be partially preprocessed).
        state_space = Dict(
            a=FloatBox(shape=(2,)),
            b=FloatBox(shape=(5,)),
            add_batch_rank=True
        )
        # action_space.
        action_space = IntBox(2, add_batch_rank=True)

        preprocessor_spec = dict(
            type="dict-preprocessor-stack",
            preprocessors=dict(
                a=[
                    dict(type="convert_type", to_dtype="float"),
                    dict(type="multiply", factor=0.5)
                ]
            )
        )
        # Simple custom NN with dict input (splits the input into 2 streams of simple dense layers and concatenates them at the end).
        policy = Policy(network_spec=DummyNNWithDictInput(num_units_a=2, num_units_b=3, scope="dummy-nn"),
                        action_space=action_space)
        exploration = None  # no exploration

        actor_component = ActorComponent(preprocessor_spec, policy, exploration)

        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(states=state_space),
            action_space=action_space
        )

        # Some state inputs (batch size=4).
        states = state_space.sample(size=4)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(actor_component.variable_registry)
        # Expected NN-output.
        expected_nn_output_stream_a = np.matmul(
            states["a"] * 0.5, actor_component_params["actor-component/policy/dummy-nn/dense-a/dense/kernel"]
        )
        expected_nn_output_stream_b = np.matmul(
            states["b"], actor_component_params["actor-component/policy/dummy-nn/dense-b/dense/kernel"]
        )
        expected_nn_output = np.concatenate((expected_nn_output_stream_a, expected_nn_output_stream_b), axis=-1)

        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output,
            actor_component_params["actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = dict(a=states["a"] * 0.5, b=states["b"])
        test.test(
            ("get_preprocessed_state_and_action", states),
            expected_outputs=dict(
                preprocessed_state=expected_preprocessed_state, action=expected_actions,
                nn_outputs=expected_nn_output
            )
        )
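
The key point of this test is that the "dict-preprocessor-stack" only touches the keys it names: "a" is type-converted and scaled by 0.5, while "b" passes through untouched (see expected_preprocessed_state above). A numpy-only sketch of that mapping (a hypothetical helper, not part of the library):

import numpy as np

def expected_dict_preprocessing(state):
    # Only key "a" has a PreprocessorStack attached (convert_type + multiply by 0.5);
    # key "b" is passed through unchanged.
    return dict(a=state["a"].astype(np.float32) * 0.5, b=state["b"])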
Example #3
    def test_actor_component_with_lstm_network(self):
        # state space and internal state space
        state_space = FloatBox(shape=(2,), add_batch_rank=True, add_time_rank=True, time_major=False)
        internal_states_space = Tuple(FloatBox(shape=(3,)), FloatBox(shape=(3,)), add_batch_rank=True)
        time_percentages_space = FloatBox()
        # action_space.
        action_space = IntBox(2, add_batch_rank=True, add_time_rank=True)

        preprocessor = PreprocessorStack.from_spec(
            [dict(type="convert_type", to_dtype="float"), dict(type="divide", divisor=10)]
        )
        policy = Policy(network_spec=config_from_path("configs/test_lstm_nn.json"), action_space=action_space)
        exploration = Exploration(epsilon_spec=dict(decay_spec=dict(
            type="linear_decay", from_=1.0, to_=0.1)
        ))
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(
            component=actor_component,
            input_spaces=dict(
                states=state_space,
                other_nn_inputs=Tuple(internal_states_space, add_batch_rank=True),
                time_percentage=time_percentages_space
            ),
            action_space=action_space
        )
        # Some state inputs (batch size=2, seq-len=1000; batch-major).
        np.random.seed(10)
        states = state_space.sample(size=(1000, 2))
        initial_internal_states = internal_states_space.zeros(size=2)  # only batch
        time_percentages = time_percentages_space.sample(1000)

        # Run a single time-step n times to simulate acting and env interaction with an LSTM.
        preprocessed_states = np.ndarray(shape=(1000, 2, 2), dtype=np.float32)
        actions = np.ndarray(shape=(1000, 2, 1), dtype=np.int32)
        for i, time_percentage in enumerate(time_percentages):
            ret = test.test((
                "get_preprocessed_state_and_action",
                # expand time dim at 1st slot as we are time-major == False
                [np.expand_dims(states[i], 1), tuple([initial_internal_states]), time_percentage]
            ))
            preprocessed_states[i] = ret["preprocessed_state"][:, 0, :]  # take the time-rank out again
            actions[i] = ret["action"]
            # Check c/h-state shape.
            self.assertEqual(ret["nn_outputs"][1][0].shape, (2, 3))  # batch-size=2, LSTM units=3
            self.assertEqual(ret["nn_outputs"][1][1].shape, (2, 3))

        # Check all preprocessed states (easy: just divided by 10).
        expected_preprocessed_state = states / 10
        recursive_assert_almost_equal(preprocessed_states, expected_preprocessed_state)

        # Check the exploration functionality over the actions.
        # Not checking the mean as we are mostly in the non-exploratory region; that's also why the stddev should be small.
        stddev_actions = actions.std()
        self.assertGreater(stddev_actions, 0.4)
        self.assertLess(stddev_actions, 0.6)
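
The epsilon exploration above uses a "linear_decay" spec (from_=1.0, to_=0.1) driven by the time_percentage input. A hedged sketch of the implied schedule, assuming plain linear interpolation over the given time percentage:

def epsilon_at(time_percentage, from_=1.0, to_=0.1):
    # Assumed linear interpolation: epsilon is 1.0 at time_percentage=0.0
    # and decays to 0.1 at time_percentage=1.0.
    return from_ + (to_ - from_) * time_percentage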
Example #4
    def __init__(self,
                 policy_spec,
                 minimum_batch_size=1,
                 maximum_batch_size=1024,
                 timeout_ms=100,
                 scope="dynamic-batching-policy",
                 **kwargs):
        """
        Args:
            policy_spec (Union[Policy,dict]): A spec dict to construct the Policy that is wrapped by this
                DynamicBatchingPolicy, or a Policy object directly.
            minimum_batch_size (int): The minimum batch size to use. Default: 1.
            maximum_batch_size (int): The maximum batch size to use. Default: 1024.
            timeout_ms (int): The timeout in ms to use when waiting for items on the queue.
                Default: 100ms.
        """
        super(DynamicBatchingPolicy, self).__init__(
            # 5 outputs: state-values, logits, probabilities, log-probs, internal states.
            graph_fn_num_outputs=dict(
                _graph_fn_get_state_values_logits_probabilities_log_probs=5),
            scope=scope,
            **kwargs)

        # The wrapped, backend-specific policy object.
        self.policy = Policy.from_spec(policy_spec)

        # hack: link in case parent components call APIs of the distribution directly
        self.action_adapter = self.policy.action_adapter
        self.distribution = self.policy.distribution
        self.deterministic = True

        # Dynamic batching options.
        self.minimum_batch_size = minimum_batch_size
        self.maximum_batch_size = maximum_batch_size
        self.timeout_ms = timeout_ms

        self.add_components(self.policy)

        # TODO: for now, only define this one API-method as this is the only one used in IMPALA.
        # TODO: Generalize this component so it can wrap arbitrary other components and simulate their API.
        self.define_api_method(
            "get_state_values_logits_probabilities_log_probs",
            self._graph_fn_get_state_values_logits_probabilities_log_probs)
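
A short construction sketch, assuming only what the signature and docstring above show (my_policy_spec is a hypothetical Policy spec dict or Policy object):

# Sketch: wrap an existing policy with dynamic batching, tightening the defaults a bit.
dynamic_policy = DynamicBatchingPolicy(
    policy_spec=my_policy_spec,  # dict spec or Policy object (hypothetical variable)
    minimum_batch_size=1,
    maximum_batch_size=256,
    timeout_ms=50
)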
Example #5
    def test_simple_actor_component(self):
        # state_space (the NN is a simple, single fc-layer ReLU network with 2 units, random weights and biases).
        state_space = FloatBox(shape=(5, ), add_batch_rank=True)
        # action_space.
        action_space = IntBox(10)

        preprocessor = PreprocessorStack.from_spec([
            dict(type="convert_type", to_dtype="float"),
            dict(type="multiply", factor=2)
        ])
        policy = Policy(
            network_spec=config_from_path("configs/test_simple_nn.json"),
            action_space=action_space)
        exploration = Exploration()  # no exploration
        actor_component = ActorComponent(preprocessor, policy, exploration)
        test = ComponentTest(component=actor_component,
                             input_spaces=dict(states=state_space),
                             action_space=action_space)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(
            actor_component.variables)

        # Some state inputs (5 input nodes, batch size=2).
        states = state_space.sample(2)
        # Expected NN-output.
        expected_nn_output = np.matmul(
            states * 2, actor_component_params[
                "actor-component/policy/test-network/hidden-layer/dense/kernel"]
        )
        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output, actor_component_params[
                "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = states * 2
        test.test(("get_preprocessed_state_and_action", states),
                  expected_outputs=dict(
                      preprocessed_state=expected_preprocessed_state,
                      action=expected_actions))

        # Get actions and action-probs by calling a different API-method.
        states = state_space.sample(5)
        # Get and check some actions.
        actor_component_params = test.read_variable_values(
            actor_component.variables)
        # Expected NN-output.
        expected_nn_output = np.matmul(
            states * 2, actor_component_params[
                "actor-component/policy/test-network/hidden-layer/dense/kernel"]
        )
        # Raw action layer output.
        expected_action_layer_output = np.matmul(
            expected_nn_output, actor_component_params[
                "actor-component/policy/action-adapter-0/action-network/action-layer/dense/kernel"]
        )
        # No reshape necessary (simple action space), softmax to get probs.
        expected_action_probs = softmax(expected_action_layer_output)
        # Final actions (max-likelihood/greedy pick).
        expected_actions = np.argmax(expected_action_layer_output, axis=-1)
        expected_preprocessed_state = states * 2
        test.test(("get_preprocessed_state_action_and_action_probs", states),
                  expected_outputs=dict(
                      preprocessed_state=expected_preprocessed_state,
                      action=expected_actions,
                      action_probs=expected_action_probs))
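
The softmax helper used above is not shown in this snippet; a standard, numerically stable numpy version (an assumption about the helper, not its actual source) looks like this:

import numpy as np

def softmax(x, axis=-1):
    # Subtract the per-row max for numerical stability, then normalize the exponentials.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=axis, keepdims=True)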