Example 1
    def recurrent_inference(
            self, latent_state: np.ndarray,
            action: int) -> typing.Tuple[float, np.ndarray, np.ndarray, float]:
        """
        Combines the prediction and dynamics implementations into one call. This reduces overhead.

        The integer action is encoded as a one-hot vector. Both the latent state and the action vector are
        padded with a batch-size dimension of length 1. The inferred reward and state value are
        cast from their distributional bins into scalars.

        :param latent_state: A neural encoding of the environment at step k: s_k.
        :param action: An integer action to perform on the latent state.
        :return: A tuple with predictions of the following form:
            r: The immediate predicted reward of the environment.
            s_(k+1): A new 'latent_state' resulting from performing the 'action' in the latent_state.
            pi: a policy vector for the provided state - a numpy array of length |action_space|.
            v: a float that gives the state value estimate of the provided state.
        """
        # One hot encode integer actions.
        a_plane = np.zeros(self.action_size)
        a_plane[action] = 1

        # Pad batch dimension
        latent_state = latent_state[np.newaxis, ...]
        a_plane = a_plane[np.newaxis, ...]

        r, s_next, pi, v = self.neural_net.recurrent.predict(
            [latent_state, a_plane])

        # Cast bins to scalar
        v_real = support_to_scalar(v, self.net_args.support_size)
        r_real = support_to_scalar(r, self.net_args.support_size)

        return r_real.item(), s_next[0], pi[0], v_real.item()
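For orientation, below is a minimal sketch of how recurrent_inference could be used when expanding a node during tree search. The Node class, its fields, and the expand_child helper are assumptions made for this illustration, not part of the repository's API.

from dataclasses import dataclass, field
import typing

import numpy as np


@dataclass
class Node:  # Illustrative search-tree node, not the repository's class.
    latent_state: np.ndarray
    reward: float = 0.0
    prior: typing.Optional[np.ndarray] = None
    children: typing.Dict[int, "Node"] = field(default_factory=dict)


def expand_child(network, node: Node, action: int) -> float:
    # Unroll the dynamics/prediction networks one step from the parent's latent state.
    reward, next_state, prior, value = network.recurrent_inference(node.latent_state, action)

    # Store the predicted transition on a new child node.
    node.children[action] = Node(latent_state=next_state, reward=reward, prior=prior)
    return value  # Backed up along the search path by the caller.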
Example 2
    def log_batch(self, data_batch: typing.List) -> None:
        """
        Log a large number of neural-network statistics based on the given batch.
        This functionality is toggled on by passing '--debug' as a console argument to Main.py.
        Note: toggling this functionality on will produce significantly larger TensorBoard event files!

        Statistics include:
         - Prioritized-sampling probabilities of the samples.
         - Values of each target/prediction in the data batch.
         - Loss discrepancy between cross-entropy and MSE for the reward/value predictions.
        """
        if DEBUG_MODE and self.reference.steps % LOG_RATE == 0:
            observations, targets, sample_weight = list(zip(*data_batch))
            target_pis, target_vs = list(map(np.asarray, zip(*targets)))
            observations = np.asarray(observations)

            priority = sample_weight * len(data_batch)  # Undo 1/n scaling to get priority
            tf.summary.histogram("sample probability", data=priority, step=self.reference.steps)

            pis, vs = self.reference.neural_net.model.predict_on_batch(observations)
            v_reals = support_to_scalar(vs, self.reference.net_args.support_size).ravel()  # as scalars

            tf.summary.histogram("v_targets", data=target_vs, step=self.reference.steps)
            tf.summary.histogram("v_predict", data=v_reals, step=self.reference.steps)

            mse = np.mean((v_reals - target_vs) ** 2)
            tf.summary.scalar("v_mse", data=mse, step=self.reference.steps)
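Note that in TensorFlow 2 these tf.summary.histogram/scalar calls only record data while a default summary writer is active. A minimal sketch of the expected wiring, where the log directory and the 'monitor' variable are assumptions of this illustration:

import tensorflow as tf

# Hypothetical setup: create a file writer and make it the default while logging.
writer = tf.summary.create_file_writer("logs/debug")

with writer.as_default():
    monitor.log_batch(data_batch)  # 'monitor' is an instance of the class shown above.
writer.flush()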
Example 3
    def initial_inference(
        self, observations: np.ndarray
    ) -> typing.Tuple[np.ndarray, np.ndarray, float]:
        """
        Combines the prediction and representation implementations into one call. This reduces
        overhead and results in a significant speed up.

        The observation array is padded with a batch-size dimension of length 1. The inferred state value is
        cast from its distributional bins into a scalar.

        :param observations: A game specific (stacked) tensor of observations of the environment at step t: o_t.
        :return: A tuple with predictions of the following form:
            s_(0): The root 'latent_state' produced by the representation function
            pi: a policy vector for the provided state - a numpy array of length |action_space|.
            v: a float that gives the state value estimate of the provided state.
        """
        # Pad batch dimension
        observations = observations[np.newaxis, ...]

        s_0, pi, v = self.neural_net.forward.predict(observations)

        # Cast bins to scalar
        v_real = support_to_scalar(v, self.net_args.support_size)

        return s_0[0], pi[0], v_real.item()
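As a companion to the expansion sketch above, the following hedged illustration shows how initial_inference could seed the root of a search. The make_root helper and the legal-action mask are assumptions; the Node class is the illustrative one introduced earlier.

import numpy as np


def make_root(network, observation: np.ndarray, legal_actions: np.ndarray):
    # Encode the real observation once; all further unrolling happens in latent space.
    root_state, prior, value = network.initial_inference(observation)

    # Mask the prior over legal actions and renormalize (illustrative only).
    masked = prior * legal_actions
    prior = masked / max(masked.sum(), 1e-8)

    # Reuses the illustrative Node class from the expansion sketch.
    return Node(latent_state=root_state, reward=0.0, prior=prior), value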
Example 4
    def test_reward_distribution_transformation(self):
        bins = 300  # Ensure that bins is large enough to support 'high'.
        n = 10  # Number of samples to draw
        high = 1e3  # Factor to scale the randomly generated rewards

        # Generate some random (large) values
        scalars = np.random.randn(n) * high

        # Cast scalars to support points of a categorical distribution.
        support = scalar_to_support(scalars, bins)

        # Ensure correct dimensionality
        self.assertEqual(support.shape, (n, bins * 2 + 1))

        # Cast support points back to scalars.
        inverted = support_to_scalar(support, bins)

        # Ensure correct dimensionality
        self.assertEqual(inverted.shape, scalars.shape)

        # Scalar to support and back to scalars should be equal.
        np.testing.assert_array_almost_equal(scalars, inverted)

        # Test bin creation explicitly against manually calculated example.
        scalars = [-2.5, -0.75, 0.2, 1.38, 2.99]
        expected = [
            [0.5, 0.5, 0, 0, 0, 0, 0],
            [0, 0, 0.75, 0.25, 0, 0, 0],
            [0, 0, 0, 0.8, 0.2, 0, 0],
            [0, 0, 0, 0, 0.62, 0.38, 0],
            [0, 0, 0, 0, 0, 0.01, 0.99]
        ]
        support = scalar_to_support(scalars, 3, reward_transformer=lambda x: x)
        np.testing.assert_array_almost_equal(expected, support)
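For reference, here is a minimal sketch of the two transforms this test exercises, assuming the MuZero convention of spreading each (transformed) scalar over its two neighbouring integer bins in [-support_size, support_size]. This is an illustration, not the repository's implementation; the identity default for the transform mirrors the reward_transformer=lambda x: x used in the test above.

import numpy as np


def scalar_to_support_sketch(scalars, support_size: int, transform=lambda x: x):
    # Spread each (transformed) scalar over its two neighbouring integer bins.
    x = np.clip(transform(np.asarray(scalars, dtype=np.float64)), -support_size, support_size)
    lower = np.floor(x).astype(int)
    remainder = x - lower

    support = np.zeros((len(x), 2 * support_size + 1))
    rows = np.arange(len(x))
    support[rows, lower + support_size] = 1.0 - remainder
    in_range = lower + 1 <= support_size  # The upper neighbour may fall outside the support.
    support[rows[in_range], (lower + 1 + support_size)[in_range]] = remainder[in_range]
    return support


def support_to_scalar_sketch(support, support_size: int, inverse_transform=lambda x: x):
    # Expected value over the integer bins, followed by the inverse reward transform.
    bins = np.arange(-support_size, support_size + 1)
    return inverse_transform(np.asarray(support) @ bins)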
Example 5
    def predict(self,
                observations: np.ndarray) -> typing.Tuple[np.ndarray, float]:
        """
        Infer the neural network move probability prior and state value given a state observation.

        The observation array is padded with a batch-size dimension of length 1. The inferred state value is
        cast from its distributional bins into a scalar.

        :param observations: Observation representation of the form (width x height x (depth * time)).
        :return: A tuple with predictions of the following form:
            pi: a policy vector for the provided state - a numpy array of length |action_space|.
            v: a float that gives the state value estimate of the provided state.
        """
        # Pad input with batch dimension
        observation = observations[np.newaxis, ...]

        pi, v = self.neural_net.model.predict(observation)

        # Cast distribution bins to scalar
        v_real = support_to_scalar(v, self.net_args.support_size)

        return pi[0], v_real.item()
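A short, hypothetical usage of predict for greedy move selection; the select_move helper and the binary legal-move mask are assumptions of this sketch:

import numpy as np


def select_move(network, observation: np.ndarray, legal_moves: np.ndarray):
    # legal_moves is assumed to be a binary mask of length |action_space|.
    pi, value = network.predict(observation)

    masked = pi * legal_moves
    if masked.sum() <= 0:
        masked = legal_moves / legal_moves.sum()  # Fall back to a uniform policy over legal moves.
    return int(np.argmax(masked)), value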
Example 6
    def log_batch(self, data_batch: typing.List) -> None:
        """
        Log a large number of neural-network statistics based on the given batch.
        This functionality is toggled on by passing '--debug' as a console argument to Main.py.
        Note: toggling this functionality on will produce significantly larger TensorBoard event files!

        Statistics include:
         - Prioritized-sampling probabilities of the samples.
         - Loss of each recurrent head per sample as a distribution.
         - Loss discrepancy between cross-entropy and MSE for the reward/value predictions.
         - Norm of the neural network's weights.
         - Divergence between the dynamics and encoder functions.
         - Squared error of the decoding function.
        """
        if DEBUG_MODE and self.reference.steps % LOG_RATE == 0:
            observations, actions, targets, forward_observations, sample_weight = list(zip(*data_batch))
            actions, sample_weight = np.asarray(actions), np.asarray(sample_weight)
            target_vs, target_rs, target_pis = list(map(np.asarray, zip(*targets)))

            priority = sample_weight * len(data_batch)  # Undo 1/n scaling to get priority
            tf.summary.histogram("sample probability", data=priority, step=self.reference.steps)

            s, pi, v = self.reference.neural_net.forward.predict_on_batch(np.asarray(observations))

            v_real = support_to_scalar(v, self.reference.net_args.support_size).ravel()

            tf.summary.histogram(f"v_predict_{0}", data=v_real, step=self.reference.steps)
            tf.summary.histogram(f"v_target_{0}", data=target_vs[:, 0], step=self.reference.steps)
            tf.summary.scalar(f"v_mse_{0}", data=np.mean((v_real - target_vs[:, 0]) ** 2), step=self.reference.steps)

            # Sum over one-hot-encoded actions. If this sum is zero, then there is no action --> leaf node.
            absorb_k = 1.0 - tf.reduce_sum(target_pis, axis=-1)

            collect = list()
            for k in range(actions.shape[1]):
                r, s, pi, v = self.reference.neural_net.recurrent.predict_on_batch([s, actions[:, k, :]])

                collect.append((s, v, r, pi, absorb_k[k, :]))

            for t, (s, v, r, pi, absorb) in enumerate(collect):
                k = t + 1

                pi_loss = -np.sum(target_pis[:, k] * np.log(pi + 1e-8), axis=-1)
                self.log_distribution(pi_loss, f"pi_dist_{k}")

                v_real = support_to_scalar(v, self.reference.net_args.support_size).ravel()
                r_real = support_to_scalar(r, self.reference.net_args.support_size).ravel()

                self.log_distribution(r_real, f"r_predict_{k}")
                self.log_distribution(v_real, f"v_predict_{k}")

                self.log_distribution(target_rs[:, k], f"r_target_{k}")
                self.log_distribution(target_vs[:, k], f"v_target_{k}")

                self.log(np.mean((r_real - target_rs[:, k]) ** 2), f"r_mse_{k}")
                self.log(np.mean((v_real - target_vs[:, k]) ** 2), f"v_mse_{k}")

            l2_norm = tf.reduce_sum([safe_l2norm(x) for x in self.reference.get_variables()])
            self.log(l2_norm, "l2 norm")

            # Option to track statistical properties of the dynamics model.
            if self.reference.net_args.dynamics_penalty > 0:
                forward_observations = np.asarray(forward_observations)
                # Compute statistics related to auto-encoding state dynamics:
                for t, (s, v, r, pi, absorb) in enumerate(collect):
                    k = t + 1
                    stacked_obs = forward_observations[:, t, ...]

                    s_enc = self.reference.neural_net.encoder.predict_on_batch(stacked_obs)
                    kl_divergence = tf.keras.losses.kullback_leibler_divergence(s_enc, s)

                    # Relative entropy of dynamics model and encoder.
                    # Lower values indicate that the prediction model receives more stable input.
                    self.log_distribution(kl_divergence, f"KL_Divergence_{k}")
                    self.log(np.mean(kl_divergence), f"Mean_KLDivergence_{k}")

                    # Internal entropy of the dynamics model
                    s_entropy = tf.keras.losses.categorical_crossentropy(s, s)
                    self.log(np.mean(s_entropy), f"mean_dynamics_entropy_{k}")

                    if hasattr(self.reference.neural_net, "decoder"):
                        # If available, track the performance of a neural decoder from latent to real state.
                        stacked_obs_predict = self.reference.neural_net.decoder.predict_on_batch(s)
                        se = (stacked_obs - stacked_obs_predict) ** 2

                        self.log(np.mean(se), f"decoder_error_{k}")
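The calls to self.log and self.log_distribution above refer to helper methods that are not shown in this snippet. A minimal sketch of what such wrappers might look like, assuming they simply forward to tf.summary at the monitor's current training step:

    def log(self, value, tag: str) -> None:
        # Assumed helper: write a scalar summary at the owner's current training step.
        tf.summary.scalar(tag, data=value, step=self.reference.steps)

    def log_distribution(self, values, tag: str) -> None:
        # Assumed helper: write a histogram summary at the owner's current training step.
        tf.summary.histogram(tag, data=values, step=self.reference.steps)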