    def test_keras_style_one_container_input_space(self):
        # Define one container input Space.
        input_space = Tuple(IntBox(3), FloatBox(shape=(4,)), add_batch_rank=True)

        # One-hot flatten the int tensor.
        flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space[0])
        # Run the float tensor through two dense layers.
        dense_1_out = DenseLayer(units=3, scope="d1")(input_space[1])
        dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
        # Concat everything.
        cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

        # Use the `outputs` arg so the network can trace the data flow back to the input space.
        # `inputs` is not needed here as there is only a single input (the Tuple).
        neural_net = NeuralNetwork(outputs=cat_out)

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=input_space))

        var_dict = neural_net.variable_registry
        w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
        w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
        b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

        # Batch of size=n.
        input_ = input_space.sample(4)

        expected = np.concatenate([  # concat everything
            one_hot(input_[0]),  # int flattening
            dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
        ], axis=-1)
        out = test.test(("call", tuple([input_])), expected_outputs=expected)

        test.terminate()

    def test_keras_style_two_separate_input_spaces(self):
        # Define two input Spaces first. Independently (no container).
        input_space_1 = IntBox(3, add_batch_rank=True)
        input_space_2 = FloatBox(shape=(4,), add_batch_rank=True)

        # One-hot flatten the int tensor.
        flatten_layer_out = ReShape(flatten=True, flatten_categories=True)(input_space_1)
        # Run the float tensor through two dense layers.
        dense_1_out = DenseLayer(units=3, scope="d1")(input_space_2)
        dense_2_out = DenseLayer(units=5, scope="d2")(dense_1_out)
        # Concat everything.
        cat_out = ConcatLayer()(flatten_layer_out, dense_2_out)

        # Use the `outputs` arg so the network can trace the data flow back to the input spaces.
        neural_net = NeuralNetwork(inputs=[input_space_1, input_space_2], outputs=cat_out)

        test = ComponentTest(component=neural_net, input_spaces=dict(inputs=[input_space_1, input_space_2]))

        var_dict = neural_net.variable_registry
        w1_value = test.read_variable_values(var_dict["neural-network/d1/dense/kernel"])
        b1_value = test.read_variable_values(var_dict["neural-network/d1/dense/bias"])
        w2_value = test.read_variable_values(var_dict["neural-network/d2/dense/kernel"])
        b2_value = test.read_variable_values(var_dict["neural-network/d2/dense/bias"])

        # Batch of size=n.
        input_ = [input_space_1.sample(4), input_space_2.sample(4)]

        expected = np.concatenate([  # concat everything
            one_hot(input_[0]),  # int flattening
            dense_layer(dense_layer(input_[1], w1_value, b1_value), w2_value, b2_value)  # float -> 2 x dense
        ], axis=-1)
        out = test.test(("call", input_), expected_outputs=expected)

        test.terminate()

    def test_multi_input_stream_neural_network_with_dict(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        input_space = Dict(
            a=FloatBox(shape=(3,)),
            b=IntBox(4, shape=()),
            add_batch_rank=True
        )

        multi_input_nn = MultiInputStreamNeuralNetwork(
            input_network_specs=dict(
                a=[],
                b=[{"type": "reshape", "flatten": True, "flatten_categories": True}]
            ),
            post_network_spec=[{"type": "dense", "units": 2}],
        )

        test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

        # Batch of size=n.
        nn_inputs = input_space.sample(5)

        global_scope = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
        # Calculate output manually.
        var_dict = test.read_variable_values()

        b_flat = one_hot(nn_inputs["b"], depth=4)
        concat_out = np.concatenate((nn_inputs["a"], b_flat), axis=-1)
        expected = dense_layer(concat_out, var_dict[global_scope+"kernel"], var_dict[global_scope+"bias"])

        test.test(("call", nn_inputs), expected_outputs=expected)

        test.terminate()

    def test_multi_input_stream_neural_network_with_tuple(self):
        # Space must contain batch dimension (otherwise, NNLayer will complain).
        input_space = Tuple(
            IntBox(3, shape=()),
            FloatBox(shape=(8,)),
            IntBox(4, shape=()),
            add_batch_rank=True
        )

        multi_input_nn = MultiInputStreamNeuralNetwork(
            input_network_specs=(
                [{"type": "reshape", "flatten": True, "flatten_categories": True}],  # intbox -> flatten
                [{"type": "dense", "units": 2}],  # floatbox -> dense
                [{"type": "reshape", "flatten": True, "flatten_categories": True}]  # inbox -> flatten
            ),
            post_network_spec=[{"type": "dense", "units": 3}],
        )

        test = ComponentTest(component=multi_input_nn, input_spaces=dict(inputs=input_space))

        # Batch of size=n.
        nn_inputs = input_space.sample(3)

        global_scope_pre = "multi-input-stream-nn/input-stream-nn-"
        global_scope_post = "multi-input-stream-nn/post-concat-nn/dense-layer/dense/"
        # Calculate output manually.
        var_dict = test.read_variable_values()

        flat_0 = one_hot(nn_inputs[0], depth=3)
        dense_1 = dense_layer(
            nn_inputs[1], var_dict[global_scope_pre+"1/dense-layer/dense/kernel"],
            var_dict[global_scope_pre+"1/dense-layer/dense/bias"]
        )
        flat_2 = one_hot(nn_inputs[2], depth=4)
        concat_out = np.concatenate((flat_0, dense_1, flat_2), axis=-1)
        expected = dense_layer(concat_out, var_dict[global_scope_post+"kernel"], var_dict[global_scope_post+"bias"])

        test.test(("call", tuple([nn_inputs])), expected_outputs=expected)

        test.terminate()
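
The tests above check outputs against plain-numpy reference helpers (`one_hot`, `dense_layer`) imported in these examples from rlgraph.utils.numpy. A minimal sketch of the semantics assumed here (one-hot expansion along a new trailing axis, plain affine transform without activation); the `_reference` names are illustrative, not the library's:

import numpy as np

def one_hot_reference(indices, depth):
    # Expand integer categories into a trailing one-hot axis (float32), analogous to tf.one_hot.
    indices = np.asarray(indices)
    flat = np.eye(depth, dtype=np.float32)[indices.reshape(-1)]
    return flat.reshape(indices.shape + (depth,))

def dense_layer_reference(inputs, kernel, bias):
    # Plain dense layer: inputs @ kernel + bias (no activation applied).
    return np.matmul(inputs, kernel) + bias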
Example #5
    def test_reshape_with_flatten_option_with_0D_shape(self):
        # Test flattening int with shape=().
        in_space = IntBox(3, shape=(), add_batch_rank=True)
        reshape = ReShape(flatten=True, flatten_categories=3)
        test = ComponentTest(component=reshape,
                             input_spaces=dict(preprocessing_inputs=in_space))

        test.test("reset")
        # Batch of size=4.
        inputs = in_space.sample(size=4)
        # Expect a by-int-category one-hot flattening.
        expected = one_hot(inputs, depth=3)
        test.test(("apply", inputs), expected_outputs=expected)
Example #6
    def test_v_trace_function_more_complex(self):
        v_trace_function = VTraceFunction()
        v_trace_function_reference = VTraceFunction(backend="python")

        action_space = IntBox(9,
                              add_batch_rank=True,
                              add_time_rank=True,
                              time_major=True)
        action_space_flat = FloatBox(shape=(9, ),
                                     add_batch_rank=True,
                                     add_time_rank=True,
                                     time_major=True)
        input_spaces = dict(logits_actions_pi=self.time_x_batch_x_9_space,
                            log_probs_actions_mu=self.time_x_batch_x_9_space,
                            actions=action_space,
                            actions_flat=action_space_flat,
                            discounts=self.time_x_batch_x_1_space,
                            rewards=self.time_x_batch_x_1_space,
                            values=self.time_x_batch_x_1_space,
                            bootstrapped_values=self.time_x_batch_x_1_space)

        test = ComponentTest(component=v_trace_function,
                             input_spaces=input_spaces)

        size = (100, 16)
        logits_actions_pi = self.time_x_batch_x_9_space.sample(size=size)
        logits_actions_mu = self.time_x_batch_x_9_space.sample(size=size)
        log_probs_actions_mu = np.log(softmax(logits_actions_mu))
        actions = action_space.sample(size=size)
        actions_flat = one_hot(actions, depth=action_space.num_categories)
        # Set some discounts to 0.0 (these will mark the end of episodes, where the value is 0.0).
        discounts = np.random.choice([0.0, 0.99],
                                     size=size + (1, ),
                                     p=[0.1, 0.9])
        rewards = self.time_x_batch_x_1_space.sample(size=size)
        values = self.time_x_batch_x_1_space.sample(size=size)
        bootstrapped_values = self.time_x_batch_x_1_space.sample(
            size=(1, size[1]))

        input_ = [
            logits_actions_pi, log_probs_actions_mu, actions, actions_flat,
            discounts, rewards, values, bootstrapped_values
        ]

        vs_expected, pg_advantages_expected = v_trace_function_reference._graph_fn_calc_v_trace_values(
            *input_)

        test.test(("calc_v_trace_values", input_),
                  expected_outputs=[vs_expected, pg_advantages_expected],
                  decimals=4)
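
For reference, the v-trace targets checked here follow the IMPALA definition; a sketch in standard notation (the truncation levels \bar\rho and \bar c come from the VTraceFunction's configuration, which is not shown above):

    v_s = V(x_s) + \sum_{t=s}^{s+n-1} \gamma^{\,t-s} \Big(\prod_{i=s}^{t-1} c_i\Big)\, \delta_t V,
    \qquad
    \delta_t V = \rho_t \big(r_t + \gamma V(x_{t+1}) - V(x_t)\big),

with \rho_t = \min\big(\bar\rho,\ \pi(a_t \mid x_t) / \mu(a_t \mid x_t)\big), c_i = \min\big(\bar c,\ \pi(a_i \mid x_i) / \mu(a_i \mid x_i)\big), and policy-gradient advantages \rho_s \big(r_s + \gamma v_{s+1} - V(x_s)\big).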
Example #7
    def test_reshape_with_flatten_option_with_categories(self):
        # Test flattening while leaving batch and time rank as is, but flattening out int categories.
        in_space = IntBox(2,
                          shape=(2, 3, 4),
                          add_batch_rank=True,
                          add_time_rank=True,
                          time_major=False)
        reshape = ReShape(flatten=True, flatten_categories=2)
        test = ComponentTest(component=reshape,
                             input_spaces=dict(preprocessing_inputs=in_space))

        test.test("reset")
        # Batch=3, time-rank=5
        inputs = in_space.sample(size=(3, 5))
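        # One-hot expands each of the 2*3*4 = 24 ints into 2 categories -> flattened size per item: 24 * 2 = 48.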
        expected = np.reshape(one_hot(inputs, depth=2),
                              newshape=(3, 5, 48)).astype(dtype=np.float32)
        test.test(("apply", inputs), expected_outputs=expected)
Example #8
    def test_multi_gpu_dqn_agent_learning_test_gridworld_2x2(self):
        """
        Tests whether the multi-GPU strategy can learn successfully on a multi-GPU system. The test
        also runs on a CPU-only system, using fake-GPU logic for testing purposes.
        """
        env_spec = dict(type="grid-world", world="2x2")
        dummy_env = GridWorld.from_spec(env_spec)
        agent_config = config_from_path(
            "configs/multi_gpu_dqn_for_2x2_gridworld.json")
        preprocessing_spec = agent_config.pop("preprocessing_spec")
        agent = DQNAgent.from_spec(
            agent_config,
            state_space=self.grid_world_2x2_flattened_state_space,
            action_space=dummy_env.action_space,
        )

        time_steps = 2000
        worker = SingleThreadedWorker(env_spec=env_spec,
                                      agent=agent,
                                      worker_executes_preprocessing=True,
                                      preprocessing_spec=preprocessing_spec)
        results = worker.execute_timesteps(time_steps, use_exploration=True)

        self.assertEqual(results["timesteps_executed"], time_steps)
        self.assertEqual(results["env_frames"], time_steps)
        self.assertGreaterEqual(results["mean_episode_reward"], -4.5)
        self.assertGreaterEqual(results["max_episode_reward"], 0.0)
        self.assertLessEqual(results["episodes_executed"], time_steps / 2)

        # Check all learnt Q-values.
        q_values = agent.graph_executor.execute(
            ("get_q_values", one_hot(np.array([0, 1]), depth=4)))[:]
        recursive_assert_almost_equal(q_values[0], (0.8, -5, 0.9, 0.8),
                                      decimals=1)
        recursive_assert_almost_equal(q_values[1], (0.8, 1.0, 0.9, 0.9),
                                      decimals=1)
Example #9
    def _graph_fn_apply(self, key, preprocessing_inputs, input_before_time_rank_folding=None):
        """
        Reshapes the input to the specified new shape.

        Args:
            preprocessing_inputs (SingleDataOp): The input to reshape.
            input_before_time_rank_folding (Optional[SingleDataOp]): The original input from before the time rank
                was folded (the folding was done by a different ReShape Component). Used when `self.unfold_time_rank`
                is True to figure out the exact time-rank dimension to unfold.

        Returns:
            SingleDataOp: The reshaped input.
        """
        assert self.unfold_time_rank is False or input_before_time_rank_folding is not None

        #preprocessing_inputs = tf.Print(preprocessing_inputs, [tf.shape(preprocessing_inputs)], summarize=1000,
        #                                message="input shape for {} (key={}): {}".format(preprocessing_inputs.name, key, self.scope))

        if self.backend == "python" or get_backend() == "python":
            # Create a one-hot axis for the categories at the end?
            if self.num_categories.get(key, 0) > 1:
                preprocessing_inputs = one_hot(preprocessing_inputs, depth=self.num_categories[key])

            new_shape = self.output_spaces[key].get_shape(
                with_batch_rank=-1, with_time_rank=-1, time_major=self.time_major
            )
            # Dynamic new shape inference:
            # If both batch and time rank must be left alone OR the time rank must be unfolded from a currently common
            # batch+time 0th rank, get these two dynamically.
            # Note: We may still flip the two, if input space has a different `time_major` than output space.
            if len(new_shape) > 2 and new_shape[0] == -1 and new_shape[1] == -1:
                # Time rank unfolding. Get the time rank from original input.
                if self.unfold_time_rank is True:
                    original_shape = input_before_time_rank_folding.shape
                    new_shape = (original_shape[0], original_shape[1]) + new_shape[2:]
                # No time-rank unfolding, but we do have both batch- and time-rank.
                else:
                    input_shape = preprocessing_inputs.shape
                    # Batch and time rank stay as is.
                    if self.time_major is None or self.time_major is self.in_space_time_majors[key]:
                        new_shape = (input_shape[0], input_shape[1]) + new_shape[2:]
                    # Batch and time rank need to be flipped around: Do a transpose.
                    else:
                        preprocessing_inputs = np.transpose(
                            preprocessing_inputs, axes=(1, 0) + tuple(range(2, len(input_shape)))
                        )
                        new_shape = (input_shape[1], input_shape[0]) + new_shape[2:]

            return np.reshape(preprocessing_inputs, newshape=new_shape)
        elif get_backend() == "pytorch":
            # Create a one-hot axis for the categories at the end?
            if self.num_categories.get(key, 0) > 1:
                preprocessing_inputs = pytorch_one_hot(preprocessing_inputs, depth=self.num_categories[key])
            new_shape = self.output_spaces[key].get_shape(
                with_batch_rank=-1, with_time_rank=-1, time_major=self.time_major
            )
            # Dynamic new shape inference:
            # If both batch and time rank must be left alone OR the time rank must be unfolded from a currently common
            # batch+time 0th rank, get these two dynamically.
            # Note: We may still flip the two, if input space has a different `time_major` than output space.
            if len(new_shape) > 2 and new_shape[0] == -1 and new_shape[1] == -1:
                # Time rank unfolding. Get the time rank from original input.
                if self.unfold_time_rank is True:
                    original_shape = input_before_time_rank_folding.shape
                    new_shape = (original_shape[0], original_shape[1]) + new_shape[2:]
                # No time-rank unfolding, but we do have both batch- and time-rank.
                else:
                    input_shape = preprocessing_inputs.shape
                    # Batch and time rank stay as is.
                    if self.time_major is None or self.time_major is self.in_space_time_majors[key]:
                        new_shape = (input_shape[0], input_shape[1]) + new_shape[2:]
                    # Batch and time rank need to be flipped around: Do a transpose.
                    else:
                        perm = (1, 0) + tuple(range(2, len(input_shape)))
                        preprocessing_inputs = preprocessing_inputs.permute(*perm)
                        new_shape = (input_shape[1], input_shape[0]) + new_shape[2:]

            # print("Reshaping input of shape {} to new shape {} ".format(preprocessing_inputs.shape, new_shape))

            # The problem here is the following: Input has dim e.g. [4, 256, 1, 1]
            # -> If shape inference in spaces failed, output dim is not correct -> reshape will attempt
            # something like reshaping to [256].
            if self.flatten or (preprocessing_inputs.size(0) > 1 and preprocessing_inputs.dim() > 1):
                return preprocessing_inputs.squeeze()
            else:
                return torch.reshape(preprocessing_inputs, new_shape)

        elif get_backend() == "tf":
            # Create a one-hot axis for the categories at the end?
            if self.num_categories.get(key, 0) > 1:
                preprocessing_inputs = tf.one_hot(
                    preprocessing_inputs, depth=self.num_categories[key], axis=-1, dtype="float32"
                )

            new_shape = self.output_spaces[key].get_shape(
                with_batch_rank=-1, with_time_rank=-1, time_major=self.time_major
            )
            # Dynamic new shape inference:
            # If both batch and time rank must be left alone OR the time rank must be unfolded from a currently common
            # batch+time 0th rank, get these two dynamically.
            # Note: We may still flip the two, if input space has a different `time_major` than output space.
            flip_after_reshape = False
            if len(new_shape) >= 2 and new_shape[0] == -1 and new_shape[1] == -1:
                # Time rank unfolding. Get the time rank from original input (and maybe flip).
                if self.unfold_time_rank is True:
                    original_shape = tf.shape(input_before_time_rank_folding)
                    new_shape = (original_shape[0], original_shape[1]) + new_shape[2:]
                    flip_after_reshape = self.flip_batch_and_time_rank
                # No time-rank unfolding, but we do have both batch- and time-rank.
                else:
                    input_shape = tf.shape(preprocessing_inputs)
                    # Batch and time rank stay as is.
                    if self.time_major is None or self.time_major is self.in_space_time_majors[key]:
                        new_shape = (input_shape[0], input_shape[1]) + new_shape[2:]
                    # Batch and time rank need to be flipped around: Do a transpose.
                    else:
                        assert self.flip_batch_and_time_rank is True
                        preprocessing_inputs = tf.transpose(
                            preprocessing_inputs, perm=(1, 0) + tuple(i for i in range(
                                2, input_shape.shape.as_list()[0]
                            )), name="transpose-flip-batch-time-ranks"
                        )
                        new_shape = (input_shape[1], input_shape[0]) + new_shape[2:]

            reshaped = tf.reshape(tensor=preprocessing_inputs, shape=new_shape, name="reshaped")

            if flip_after_reshape and self.flip_batch_and_time_rank:
                reshaped = tf.transpose(reshaped, (1, 0) + tuple(i for i in range(2, len(new_shape))), name="transpose-flip-batch-time-ranks-after-reshape")

            #reshaped = tf.Print(reshaped, [tf.shape(reshaped)], summarize=1000,
            #                    message="output shape for {} (key={}): {}".format(reshaped, key, self.scope))

            # Have to place the time rank back in as unknown (for the auto Space inference).
            if type(self.unfold_time_rank) == int:
                # TODO: replace placeholder with default value by _batch_rank/_time_rank properties.
                return tf.placeholder_with_default(reshaped, shape=(None, None) + new_shape[2:])
            else:
                # TODO: add other cases of reshaping and fix batch/time rank hints.
                if self.fold_time_rank:
                    reshaped._batch_rank = 0
                elif self.unfold_time_rank or self.flip_batch_and_time_rank:
                    reshaped._batch_rank = 0 if self.time_major is False else 1
                    reshaped._time_rank = 0 if self.time_major is True else 1

                return reshaped
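
At its core, the time-rank folding/unfolding handled above merges the leading time and batch ranks into one rank (fold) or splits them back apart using the pre-fold input (unfold). A minimal numpy sketch of that idea, assuming time-major [T, B, ...] inputs (helper names are illustrative, not part of ReShape):

import numpy as np

def fold_time_rank(x):
    # [T, B, ...] -> [T*B, ...]: merge the two leading ranks into one.
    return np.reshape(x, (-1,) + x.shape[2:])

def unfold_time_rank(folded, original_input):
    # [T*B, ...] -> [T, B, ...]: recover T and B from the input as it looked before folding.
    return np.reshape(folded, original_input.shape[:2] + folded.shape[1:])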
Example #10
    def _graph_fn_loss_per_item(self,
                                key,
                                td_targets,
                                q_values_s,
                                actions,
                                importance_weights=None):
        """
        Args:
            td_targets (SingleDataOp): The already calculated TD-target terms: r + gamma * max_a' Qt(s',a'),
                OR for double-Q: r + gamma * Qt(s', argmax_a' Q(s',a')).

            q_values_s (SingleDataOp): The batch of Q-values representing the expected accumulated discounted returns
                when in s and taking different actions a.

            actions (SingleDataOp): The batch of actions that were actually taken in states s (from a memory).

            importance_weights (Optional[SingleDataOp]): If 'self.importance_weights' is True: The batch of weights to
                apply to the losses.

        Returns:
            SingleDataOp: The loss values vector (one single value for each batch item).
        """
        # Numpy backend primarily for testing purposes.
        if self.backend == "python" or get_backend() == "python":
            from rlgraph.utils.numpy import one_hot

            actions_one_hot = one_hot(
                actions, depth=self.flat_action_space[key].num_categories)
            q_s_a_values = np.sum(q_values_s * actions_one_hot, axis=-1)

            td_delta = td_targets - q_s_a_values

            if td_delta.ndim > 1:
                if self.importance_weights:
                    td_delta = np.mean(td_delta * importance_weights,
                                       axis=list(
                                           range(1, self.ranks_to_reduce + 1)))

                else:
                    td_delta = np.mean(td_delta,
                                       axis=list(
                                           range(1, self.ranks_to_reduce + 1)))

        elif get_backend() == "tf":
            # Q(s,a) -> Use the Q-value of the action actually taken before.
            one_hot = tf.one_hot(
                indices=actions,
                depth=self.flat_action_space[key].num_categories)
            q_s_a_values = tf.reduce_sum(input_tensor=(q_values_s * one_hot),
                                         axis=-1)

            # Calculate the TD-delta (target - current estimate).
            td_delta = td_targets - q_s_a_values

            # Reduce over the composite actions, if any.
            if get_rank(td_delta) > 1:
                td_delta = tf.reduce_mean(input_tensor=td_delta,
                                          axis=list(
                                              range(1,
                                                    self.ranks_to_reduce + 1)))

        elif get_backend() == "pytorch":
            # Add batch dim in case of single sample.
            if q_values_s.dim() == 1:
                q_values_s = q_values_s.unsqueeze(-1)
                actions = actions.unsqueeze(-1)
                if self.importance_weights:
                    importance_weights = importance_weights.unsqueeze(-1)

            # Q(s,a) -> Use the Q-value of the action actually taken before.
            one_hot = pytorch_one_hot(
                actions, depth=self.flat_action_space[key].num_categories)
            q_s_a_values = torch.sum((q_values_s * one_hot), -1)

            # Calculate the TD-delta (target - current estimate).
            td_delta = td_targets - q_s_a_values

            # Reduce over the composite actions, if any.
            if get_rank(td_delta) > 1:
                td_delta = torch.mean(td_delta,
                                      tuple(range(1,
                                                  self.ranks_to_reduce + 1)),
                                      keepdim=False)

        # Apply importance-weights from a prioritized replay to the loss.
        if self.importance_weights:
            return importance_weights * td_delta
        else:
            return td_delta
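
In equation form, the value returned above for each batch item i is the (optionally importance-weighted) TD error against the pre-computed target, with Q(s_i, a_i) selected via the one-hot sum (composite-action ranks are mean-reduced as in the code above):

    \delta_i = y_i - \sum_{a} \mathbb{1}[a = a_i]\, Q(s_i, a), \qquad
    \text{loss}_i = \begin{cases} w_i\, \delta_i & \text{with importance weights} \\ \delta_i & \text{otherwise.} \end{cases}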
Example #11
    def _graph_fn_get_td_targets(self,
                                 key,
                                 rewards,
                                 terminals,
                                 qt_values_sp,
                                 q_values_sp=None):
        """
        Args:
            rewards (SingleDataOp): The batch of rewards that we received after having taken a in s (from a memory).
            terminals (SingleDataOp): The batch of terminal signals that we received after having taken a in s
                (from a memory).
            qt_values_sp (SingleDataOp): The batch of Q-values representing the expected accumulated discounted
                returns (estimated by the target net) when in s' and taking different actions a'.
            q_values_sp (Optional[SingleDataOp]): If `self.double_q` is True: The batch of Q-values representing the
                expected accumulated discounted returns (estimated by the (main) policy net) when in s' and taking
                different actions a'.

        Returns:
            SingleDataOp: The target values vector.
        """
        qt_sp_ap_values = None

        # Numpy backend primarily for testing purposes.
        if self.backend == "python" or get_backend() == "python":
            from rlgraph.utils.numpy import one_hot
            if self.double_q:
                a_primes = np.argmax(q_values_sp, axis=-1)
                a_primes_one_hot = one_hot(
                    a_primes, depth=self.flat_action_space[key].num_categories)
                qt_sp_ap_values = np.sum(qt_values_sp * a_primes_one_hot,
                                         axis=-1)
            else:
                qt_sp_ap_values = np.max(qt_values_sp, axis=-1)

            for _ in range(qt_sp_ap_values.ndim - 1):
                rewards = np.expand_dims(rewards, axis=1)

            qt_sp_ap_values = np.where(terminals,
                                       np.zeros_like(qt_sp_ap_values),
                                       qt_sp_ap_values)

        elif get_backend() == "tf":
            # Make sure the target policy's outputs are treated as constant when calculating gradients.
            qt_values_sp = tf.stop_gradient(qt_values_sp)

            if self.double_q:
                # For double-Q, we no longer use the max(a')Qt(s'a') value.
                # Instead, the a' used to get the Qt(s'a') is given by argmax(a') Q(s',a') <- Q=q-net, not target net!
                a_primes = tf.argmax(input=q_values_sp, axis=-1)

                # Now lookup Q(s'a') with the calculated a'.
                one_hot = tf.one_hot(
                    indices=a_primes,
                    depth=self.flat_action_space[key].num_categories)
                qt_sp_ap_values = tf.reduce_sum(input_tensor=(qt_values_sp *
                                                              one_hot),
                                                axis=-1)
            else:
                # Qt(s',a') -> Use the max(a') value (from the target network).
                qt_sp_ap_values = tf.reduce_max(input_tensor=qt_values_sp,
                                                axis=-1)

            # Make sure the rewards vector (batch) is broadcast correctly.
            for _ in range(get_rank(qt_sp_ap_values) - 1):
                rewards = tf.expand_dims(rewards, axis=1)

            # Ignore Q(s'a') values if s' is a terminal state. Instead use 0.0 as the state-action value for s'a'.
            # Note that in that case, the next_state (s') is not the correct next state and should be disregarded.
            # See Chapter 3.4 in "RL - An Introduction" (2017 draft) by A. Barto and R. Sutton for a detailed analysis.
            qt_sp_ap_values = tf.where(condition=terminals,
                                       x=tf.zeros_like(qt_sp_ap_values),
                                       y=qt_sp_ap_values)

        elif get_backend() == "pytorch":
            if not isinstance(terminals, torch.ByteTensor):
                terminals = terminals.byte()
            # Add batch dim in case of single sample.
            if qt_values_sp.dim() == 1:
                rewards = rewards.unsqueeze(-1)
                terminals = terminals.unsqueeze(-1)
                q_values_sp = q_values_sp.unsqueeze(-1)
                qt_values_sp = qt_values_sp.unsqueeze(-1)

            # Make sure the target policy's outputs are treated as constant when calculating gradients.
            qt_values_sp = qt_values_sp.detach()
            if self.double_q:
                # For double-Q, we no longer use the max(a')Qt(s'a') value.
                # Instead, the a' used to get the Qt(s'a') is given by argmax(a') Q(s',a') <- Q=q-net, not target net!
                a_primes = torch.argmax(q_values_sp, dim=-1, keepdim=True)

                # Now lookup Q(s'a') with the calculated a'.
                one_hot = pytorch_one_hot(
                    a_primes, depth=self.flat_action_space[key].num_categories)
                qt_sp_ap_values = torch.sum(qt_values_sp * one_hot.squeeze(),
                                            dim=-1)
            else:
                # Qt(s',a') -> Use the max(a') value (from the target network).
                qt_sp_ap_values = torch.max(qt_values_sp, -1)[0]

            # Make sure the rewards vector (batch) is broadcast correctly.
            for _ in range(get_rank(qt_sp_ap_values) - 1):
                rewards = torch.unsqueeze(rewards, dim=1)

            # Ignore Q(s'a') values if s' is a terminal state. Instead use 0.0 as the state-action value for s'a'.
            # Note that in that case, the next_state (s') is not the correct next state and should be disregarded.
            # See Chapter 3.4 in "RL - An Introduction" (2017 draft) by A. Barto and R. Sutton for a detailed analysis.
            # torch.where cannot broadcast here, so tile and reshape to same shape.
            if qt_sp_ap_values.dim() > 1:
                num_tiles = np.prod(qt_sp_ap_values.shape[1:])
                terminals = pytorch_tile(terminals, num_tiles,
                                         -1).reshape(qt_sp_ap_values.shape)
            qt_sp_ap_values = torch.where(terminals,
                                          torch.zeros_like(qt_sp_ap_values),
                                          qt_sp_ap_values)
        td_targets = (rewards + (self.discount**self.n_step) * qt_sp_ap_values)
        return td_targets
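
Written out, the targets returned above are

    y_i = r_i + \gamma^{n}\, Q_t\big(s'_i,\ \arg\max_{a'} Q(s'_i, a')\big) \quad \text{(double-Q)}, \qquad
    y_i = r_i + \gamma^{n}\, \max_{a'} Q_t(s'_i, a') \quad \text{(otherwise)},

with the bootstrap term set to 0 whenever s'_i is terminal, so that y_i = r_i there.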
Example #12
    def _graph_fn_apply(self,
                        key,
                        preprocessing_inputs,
                        input_before_time_rank_folding=None):
        """
        Reshapes the input to the specified new shape.

        Args:
            preprocessing_inputs (SingleDataOp): The input to reshape.
            input_before_time_rank_folding (Optional[SingleDataOp]): The original input from before the time rank
                was folded (the folding was done by a different ReShape Component). Used when `self.unfold_time_rank`
                is True to figure out the exact time-rank dimension to unfold.

        Returns:
            SingleDataOp: The reshaped input.
        """
        assert self.unfold_time_rank is False or input_before_time_rank_folding is not None

        if self.backend == "python" or get_backend() == "python":
            # Create a one-hot axis for the categories at the end?
            num_categories = self.get_num_categories(
                key, get_space_from_op(preprocessing_inputs))
            if num_categories and num_categories > 1:
                preprocessing_inputs = one_hot(preprocessing_inputs,
                                               depth=num_categories)

            if self.unfold_time_rank:
                new_shape = (-1, -1) + preprocessing_inputs.shape[1:]
            elif self.fold_time_rank:
                new_shape = (-1, ) + preprocessing_inputs.shape[2:]
            else:
                new_shape = self.get_preprocessed_space(
                    get_space_from_op(preprocessing_inputs)).get_shape(
                        with_batch_rank=-1, with_time_rank=-1)

            # Dynamic new shape inference:
            # If both batch and time rank must be left alone OR the time rank must be unfolded from a currently common
            # batch+time 0th rank, get these two dynamically.
            if len(preprocessing_inputs.shape
                   ) > 2 and new_shape[0] == -1 and new_shape[1] == -1:
                # Time rank unfolding. Get the time rank from original input.
                if self.unfold_time_rank is True:
                    original_shape = input_before_time_rank_folding.shape
                    new_shape = (original_shape[0],
                                 original_shape[1]) + new_shape[2:]
                # No time-rank unfolding, but we do have both batch- and time-rank.
                else:
                    input_shape = preprocessing_inputs.shape
                    # Batch and time rank stay as is.
                    new_shape = (input_shape[0],
                                 input_shape[1]) + new_shape[2:]

            return np.reshape(preprocessing_inputs, newshape=new_shape)

        elif get_backend() == "pytorch":
            # Create a one-hot axis for the categories at the end?
            num_categories = self.get_num_categories(
                key, get_space_from_op(preprocessing_inputs))
            if num_categories and num_categories > 1:
                preprocessing_inputs = pytorch_one_hot(preprocessing_inputs,
                                                       depth=num_categories)

            if self.unfold_time_rank:
                new_shape = (-1, -1) + preprocessing_inputs.shape[1:]
            elif self.fold_time_rank:
                new_shape = (-1, ) + preprocessing_inputs.shape[2:]
            else:
                new_shape = self.get_preprocessed_space(
                    get_space_from_op(preprocessing_inputs)).get_shape(
                        with_batch_rank=-1, with_time_rank=-1)

            # Dynamic new shape inference:
            # If both batch and time rank must be left alone OR the time rank must be unfolded from a currently common
            # batch+time 0th rank, get these two dynamically.
            if len(new_shape
                   ) > 2 and new_shape[0] == -1 and new_shape[1] == -1:
                # Time rank unfolding. Get the time rank from original input.
                if self.unfold_time_rank is True:
                    original_shape = input_before_time_rank_folding.shape
                    new_shape = (original_shape[0],
                                 original_shape[1]) + new_shape[2:]
                # No time-rank unfolding, but we do have both batch- and time-rank.
                else:
                    input_shape = preprocessing_inputs.shape
                    # Batch and time rank stay as is.
                    new_shape = (input_shape[0],
                                 input_shape[1]) + new_shape[2:]

            # print("Reshaping input of shape {} to new shape {} (flatten = {})".format(preprocessing_inputs.shape,
            #                                                                           new_shape, self.flatten))

            old_size = np.prod(list(preprocessing_inputs.shape))
            new_size = np.prod(new_shape)

            # The problem here is the following: Input has dim e.g. [4, 256, 1, 1]
            # -> If shape inference in spaces failed, output dim is not correct -> reshape will attempt
            # something like reshaping to [256].
            if self.flatten and preprocessing_inputs.dim() > 1:
                flattened_shape_without_batchrank = np.prod(
                    preprocessing_inputs.shape[1:])
                flattened_shape = (preprocessing_inputs.shape[0], ) + (
                    flattened_shape_without_batchrank, )
                return torch.reshape(preprocessing_inputs, flattened_shape)
            # If new shape does not fit into old shape, batch inference failed -> try to restore:
            # Equal except batch rank -> return as is:
            elif old_size != new_size:
                if tuple(preprocessing_inputs.shape[1:]) == new_shape:
                    return preprocessing_inputs
                else:
                    # Attempt to rescue reshape by combining new shape with batch dim.
                    full_new_shape = (
                        preprocessing_inputs.shape[0], ) + new_shape
                    return torch.reshape(preprocessing_inputs, full_new_shape)
            else:
                return torch.reshape(preprocessing_inputs, new_shape)

        elif get_backend() == "tf":
            # Create a one-hot axis for the categories at the end?
            space = get_space_from_op(preprocessing_inputs)
            num_categories = self.get_num_categories(key, space)
            if num_categories and num_categories > 1:
                preprocessing_inputs_ = tf.one_hot(preprocessing_inputs,
                                                   depth=num_categories,
                                                   axis=-1,
                                                   dtype="float32")
                if hasattr(preprocessing_inputs, "_batch_rank"):
                    preprocessing_inputs_._batch_rank = preprocessing_inputs._batch_rank
                if hasattr(preprocessing_inputs, "_time_rank"):
                    preprocessing_inputs_._time_rank = preprocessing_inputs._time_rank
                preprocessing_inputs = preprocessing_inputs_

            if self.unfold_time_rank:
                list_shape = preprocessing_inputs.shape.as_list()
                assert len(list_shape) == 1 or list_shape[1] is not None,\
                    "ERROR: Cannot unfold. `preprocessing_inputs` (with shape {}) " \
                    "already seems to be unfolded!".format(list_shape)
                new_shape = (-1, -1) + tuple(list_shape[1:])
            elif self.fold_time_rank:
                new_shape = (-1, ) + tuple(
                    preprocessing_inputs.shape.as_list()[2:])
            else:
                new_shape = self.get_preprocessed_space(
                    get_space_from_op(preprocessing_inputs)).get_shape(
                        with_batch_rank=-1, with_time_rank=-1)

            # Dynamic new shape inference:
            # If both batch and time rank must be left alone OR the time rank must be unfolded from a currently common
            # batch+time 0th rank, get these two dynamically.
            if len(new_shape
                   ) >= 2 and new_shape[0] == -1 and new_shape[1] == -1:
                # Time rank unfolding. Get the time rank from original input.
                if self.unfold_time_rank is True:
                    original_shape = tf.shape(input_before_time_rank_folding)
                    new_shape = (original_shape[0],
                                 original_shape[1]) + new_shape[2:]
                # No time-rank unfolding, but we do have both batch- and time-rank.
                else:
                    input_shape = tf.shape(preprocessing_inputs)
                    # Batch and time rank stay as is.
                    new_shape = (input_shape[0],
                                 input_shape[1]) + new_shape[2:]

            reshaped = tf.reshape(tensor=preprocessing_inputs,
                                  shape=new_shape,
                                  name="reshaped")

            # Have to place the time rank back in as unknown (for the auto Space inference).
            if type(self.unfold_time_rank) == int:
                # TODO: replace placeholder with default value by _batch_rank/_time_rank properties.
                return tf.placeholder_with_default(reshaped,
                                                   shape=(None, None) +
                                                   new_shape[2:])
            else:
                # TODO: add other cases of reshaping and fix batch/time rank hints.
                if self.fold_time_rank:
                    reshaped._batch_rank = 0
                elif self.unfold_time_rank:
                    reshaped._batch_rank = 1 if self.time_major is True else 0
                    reshaped._time_rank = 0 if self.time_major is True else 1
                else:
                    if space.has_batch_rank is True:
                        if space.time_major is False:
                            reshaped._batch_rank = 0
                        else:
                            reshaped._time_rank = 0
                            reshaped._batch_rank = 1
                    if space.has_time_rank is True:
                        reshaped._time_rank = 0 if space.time_major is True else 1

                return reshaped
Example #13
    def _graph_fn_loss_per_item(self,
                                q_values_s,
                                actions,
                                rewards,
                                terminals,
                                qt_values_sp,
                                q_values_sp=None,
                                importance_weights=None):
        """
        Args:
            q_values_s (SingleDataOp): The batch of Q-values representing the expected accumulated discounted returns
                when in s and taking different actions a.
            actions (SingleDataOp): The batch of actions that were actually taken in states s (from a memory).
            rewards (SingleDataOp): The batch of rewards that we received after having taken a in s (from a memory).
            terminals (SingleDataOp): The batch of terminal signals that we received after having taken a in s
                (from a memory).
            qt_values_sp (SingleDataOp): The batch of Q-values representing the expected accumulated discounted
                returns (estimated by the target net) when in s' and taking different actions a'.
            q_values_sp (Optional[SingleDataOp]): If `self.double_q` is True: The batch of Q-values representing the
                expected accumulated discounted returns (estimated by the (main) policy net) when in s' and taking
                different actions a'.
            importance_weights (Optional[SingleDataOp]): If 'self.importance_weights' is True: The batch of weights to
                apply to the losses.

        Returns:
            SingleDataOp: The loss values vector (one single value for each batch item).
        """
        # Numpy backend primarily for testing purposes.
        if self.backend == "python" or get_backend() == "python":
            from rlgraph.utils.numpy import one_hot
            if self.double_q:
                a_primes = np.argmax(q_values_sp, axis=-1)
                a_primes_one_hot = one_hot(
                    a_primes, depth=self.action_space.num_categories)
                qt_sp_ap_values = np.sum(qt_values_sp * a_primes_one_hot,
                                         axis=-1)
            else:
                qt_sp_ap_values = np.max(qt_values_sp, axis=-1)

            for _ in range(qt_sp_ap_values.ndim - 1):
                rewards = np.expand_dims(rewards, axis=1)

            qt_sp_ap_values = np.where(terminals,
                                       np.zeros_like(qt_sp_ap_values),
                                       qt_sp_ap_values)

            actions_one_hot = one_hot(actions,
                                      depth=self.action_space.num_categories)
            q_s_a_values = np.sum(q_values_s * actions_one_hot, axis=-1)

            td_delta = (
                rewards +
                (self.discount**self.n_step) * qt_sp_ap_values) - q_s_a_values

            if td_delta.ndim > 1:
                if self.importance_weights:
                    td_delta = np.mean(td_delta * importance_weights,
                                       axis=list(
                                           range(1, self.ranks_to_reduce + 1)))

                else:
                    td_delta = np.mean(td_delta,
                                       axis=list(
                                           range(1, self.ranks_to_reduce + 1)))

            return self._apply_huber_loss_if_necessary(td_delta)
        elif get_backend() == "tf":
            # Make sure the target policy's outputs are treated as constant when calculating gradients.
            qt_values_sp = tf.stop_gradient(qt_values_sp)

            if self.double_q:
                # For double-Q, we no longer use the max(a')Qt(s'a') value.
                # Instead, the a' used to get the Qt(s'a') is given by argmax(a') Q(s',a') <- Q=q-net, not target net!
                a_primes = tf.argmax(input=q_values_sp, axis=-1)

                # Now lookup Q(s'a') with the calculated a'.
                one_hot = tf.one_hot(indices=a_primes,
                                     depth=self.action_space.num_categories)
                qt_sp_ap_values = tf.reduce_sum(input_tensor=(qt_values_sp *
                                                              one_hot),
                                                axis=-1)
            else:
                # Qt(s',a') -> Use the max(a') value (from the target network).
                qt_sp_ap_values = tf.reduce_max(input_tensor=qt_values_sp,
                                                axis=-1)

            # Make sure the rewards vector (batch) is broadcast correctly.
            for _ in range(get_rank(qt_sp_ap_values) - 1):
                rewards = tf.expand_dims(rewards, axis=1)

            # Ignore Q(s'a') values if s' is a terminal state. Instead use 0.0 as the state-action value for s'a'.
            # Note that in that case, the next_state (s') is not the correct next state and should be disregarded.
            # See Chapter 3.4 in "RL - An Introduction" (2017 draft) by A. Barto and R. Sutton for a detailed analysis.
            qt_sp_ap_values = tf.where(condition=terminals,
                                       x=tf.zeros_like(qt_sp_ap_values),
                                       y=qt_sp_ap_values)

            # Q(s,a) -> Use the Q-value of the action actually taken before.
            one_hot = tf.one_hot(indices=actions,
                                 depth=self.action_space.num_categories)
            q_s_a_values = tf.reduce_sum(input_tensor=(q_values_s * one_hot),
                                         axis=-1)

            # Calculate the TD-delta (target - current estimate).
            td_delta = (
                rewards +
                (self.discount**self.n_step) * qt_sp_ap_values) - q_s_a_values

            # Reduce over the composite actions, if any.
            if get_rank(td_delta) > 1:
                td_delta = tf.reduce_mean(input_tensor=td_delta,
                                          axis=list(
                                              range(1,
                                                    self.ranks_to_reduce + 1)))

            # Apply importance-weights from a prioritized replay to the loss.
            if self.importance_weights:
                return importance_weights * self._apply_huber_loss_if_necessary(
                    td_delta)
            else:
                return self._apply_huber_loss_if_necessary(td_delta)
        elif get_backend() == "pytorch":
            if not isinstance(terminals, torch.ByteTensor):
                terminals = terminals.byte()
            # Add batch dim in case of single sample.
            if q_values_s.dim() == 1:
                q_values_s = q_values_s.unsqueeze(-1)
                actions = actions.unsqueeze(-1)
                rewards = rewards.unsqueeze(-1)
                terminals = terminals.unsqueeze(-1)
                q_values_sp = q_values_sp.unsqueeze(-1)
                qt_values_sp = qt_values_sp.unsqueeze(-1)
                if self.importance_weights:
                    importance_weights = importance_weights.unsqueeze(-1)

            # Make sure the target policy's outputs are treated as constant when calculating gradients.
            qt_values_sp = qt_values_sp.detach()
            if self.double_q:
                # For double-Q, we no longer use the max(a')Qt(s'a') value.
                # Instead, the a' used to get the Qt(s'a') is given by argmax(a') Q(s',a') <- Q=q-net, not target net!
                a_primes = torch.argmax(q_values_sp, dim=-1, keepdim=True)

                # Now lookup Q(s'a') with the calculated a'.
                one_hot = pytorch_one_hot(
                    a_primes, depth=self.action_space.num_categories)
                qt_sp_ap_values = torch.sum(qt_values_sp * one_hot, dim=-1)
            else:
                # Qt(s',a') -> Use the max(a') value (from the target network).
                qt_sp_ap_values = torch.max(qt_values_sp, -1)[0]

            # Make sure the rewards vector (batch) is broadcast correctly.
            for _ in range(get_rank(qt_sp_ap_values) - 1):
                rewards = torch.unsqueeze(rewards, dim=1)

            # Ignore Q(s'a') values if s' is a terminal state. Instead use 0.0 as the state-action value for s'a'.
            # Note that in that case, the next_state (s') is not the correct next state and should be disregarded.
            # See Chapter 3.4 in "RL - An Introduction" (2017 draft) by A. Barto and R. Sutton for a detailed analysis.
            qt_sp_ap_values = torch.where(terminals,
                                          torch.zeros_like(qt_sp_ap_values),
                                          qt_sp_ap_values)
            # Q(s,a) -> Use the Q-value of the action actually taken before.
            one_hot = pytorch_one_hot(actions,
                                      depth=self.action_space.num_categories)
            q_s_a_values = torch.sum((q_values_s * one_hot), -1)

            # Calculate the TD-delta (target - current estimate).
            td_delta = (
                rewards +
                (self.discount**self.n_step) * qt_sp_ap_values) - q_s_a_values

            # Reduce over the composite actions, if any.
            if get_rank(td_delta) > 1:
                td_delta = pytorch_reduce_mean(
                    td_delta,
                    list(range(1, self.ranks_to_reduce + 1)),
                    keepdims=False)

            # Apply importance-weights from a prioritized replay to the loss.
            if self.importance_weights:
                return importance_weights * self._apply_huber_loss_if_necessary(
                    td_delta)
            else:
                return self._apply_huber_loss_if_necessary(td_delta)
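
Putting the numpy branch above together, a compact sketch of the double-Q per-item TD error for a flat IntBox action space (function name and defaults are illustrative; the Huber loss and importance weighting applied via _apply_huber_loss_if_necessary are omitted):

import numpy as np

def dqn_td_delta(q_s, actions, rewards, terminals, qt_sp, q_sp, discount=0.99, n_step=1):
    num_actions = q_s.shape[-1]
    # a' = argmax over the online network's Q(s', .), evaluated with the target network's Qt.
    a_primes = np.argmax(q_sp, axis=-1)
    qt_sp_ap = np.sum(qt_sp * np.eye(num_actions)[a_primes], axis=-1)
    # Terminal states contribute no bootstrap value.
    qt_sp_ap = np.where(terminals, 0.0, qt_sp_ap)
    # Q(s,a) of the actions actually taken.
    q_s_a = np.sum(q_s * np.eye(num_actions)[actions], axis=-1)
    # TD error: (r + gamma^n * Qt(s',a')) - Q(s,a).
    return rewards + (discount ** n_step) * qt_sp_ap - q_s_a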