def recurrent_inference(self, latent_state: np.ndarray,
                        action: int) -> typing.Tuple[float, np.ndarray, np.ndarray, float]:
    """
    Apply one action to a latent state and evaluate the resulting state in a single network call.

    The integer action is one-hot encoded over ``self.action_size`` entries. The latent state and the
    action vector each receive a leading batch axis of length 1 before being handed to
    ``self.neural_net.recurrent.predict``. The reward and value outputs of that call are converted
    from their distributional bins to scalars with ``support_to_scalar``.

    :param latent_state: A neural encoding of the environment at step k: s_k.
    :param action: Integer index of the action to perform on the latent state.
    :return: A tuple (r, s_next, pi, v):
        r: The immediate predicted reward as a Python scalar.
        s_next: The new latent state, with the batch axis removed.
        pi: The policy vector for the new state, with the batch axis removed.
        v: The state value estimate as a Python scalar.
    """
    # One-hot encode the integer action.
    one_hot_action = np.zeros(self.action_size)
    one_hot_action[action] = 1

    # Add the batch axis of length 1 expected by the network.
    batched_state = np.expand_dims(latent_state, axis=0)
    batched_action = np.expand_dims(one_hot_action, axis=0)

    reward_bins, next_state, policy, value_bins = self.neural_net.recurrent.predict(
        [batched_state, batched_action])

    # Collapse the categorical supports to scalars (value first, then reward, as before).
    support_size = self.net_args.support_size
    value = support_to_scalar(value_bins, support_size)
    reward = support_to_scalar(reward_bins, support_size)

    return reward.item(), next_state[0], policy[0], value.item()
def log_batch(self, data_batch: typing.List) -> None:
    """
    Log neural network statistics for the given training batch to TensorBoard.

    Nothing is logged unless ``DEBUG_MODE`` is truthy and ``self.reference.steps`` is a multiple of
    ``LOG_RATE``. (Per the original documentation, debug mode is toggled with the '--debug' console
    argument to Main.py.) Note: enabling this produces significantly larger tensorboard event files!

    Logged statistics:
     - "sample probability": histogram of the sample weights multiplied by the batch size.
     - "v_targets" / "v_predict": histograms of the target values and of the scalar value predictions.
     - "v_mse": mean squared error between predicted and target values.

    :param data_batch: List of (observation, (target_pi, target_v), sample_weight) entries.
    """
    if not DEBUG_MODE or self.reference.steps % LOG_RATE != 0:
        return

    step = self.reference.steps

    observations, targets, sample_weight = list(zip(*data_batch))
    _, target_vs = list(map(np.asarray, zip(*targets)))
    observations = np.asarray(observations)

    # zip() yields tuples: multiplying a tuple by an int repeats it instead of scaling its entries,
    # so the weights must be converted to an array before undoing the 1/n scaling.
    sample_weight = np.asarray(sample_weight)
    priority = sample_weight * len(data_batch)  # Undo 1/n scaling to get priority

    tf.summary.histogram("sample probability", data=priority, step=step)

    _, vs = self.reference.neural_net.model.predict_on_batch(observations)
    # Cast distributional bins to scalars and flatten to one value per sample.
    v_reals = support_to_scalar(vs, self.reference.net_args.support_size).ravel()

    tf.summary.histogram("v_targets", data=target_vs, step=step)
    tf.summary.histogram("v_predict", data=v_reals, step=step)

    mse = np.mean((v_reals - target_vs) ** 2)
    tf.summary.scalar("v_mse", data=mse, step=step)
def initial_inference(self, observations: np.ndarray) -> typing.Tuple[np.ndarray, np.ndarray, float]:
    """
    Encode an observation into a root latent state and evaluate it in a single network call.

    The observation array receives a leading batch axis of length 1 before being handed to
    ``self.neural_net.forward.predict``. The value output is converted from its distributional bins
    to a scalar with ``support_to_scalar``.

    :param observations: A game specific (stacked) tensor of observations of the environment at step t: o_t.
    :return: A tuple (s_0, pi, v):
        s_0: The root latent state, with the batch axis removed.
        pi: The policy vector for the provided state, with the batch axis removed.
        v: The state value estimate as a Python scalar.
    """
    # Add the batch axis of length 1 expected by the network.
    batched_observations = np.expand_dims(observations, axis=0)

    root_state, policy, value_bins = self.neural_net.forward.predict(batched_observations)

    # Collapse the categorical support to a scalar.
    value = support_to_scalar(value_bins, self.net_args.support_size)

    return root_state[0], policy[0], value.item()
def test_reward_distribution_transformation(self):
    """
    Check the scalar <-> categorical support conversion.

    First verifies shapes and round-trip equality on random, large-magnitude scalars. Then compares
    ``scalar_to_support`` with an identity reward transformer against a hand-computed example.
    """
    support_size = 300  # Must be large enough to support values scaled by 'magnitude'.
    sample_count = 10   # Number of samples to draw
    magnitude = 1e3     # Factor to scale the randomly generated rewards

    # Draw some random (large) values.
    random_scalars = np.random.randn(sample_count) * magnitude

    # Scalars -> support points of a categorical distribution; one row per sample.
    encoded = scalar_to_support(random_scalars, support_size)
    self.assertEqual(encoded.shape, (sample_count, support_size * 2 + 1))

    # Support points -> scalars; the shape must match the input again.
    decoded = support_to_scalar(encoded, support_size)
    self.assertEqual(decoded.shape, random_scalars.shape)

    # The round trip should reproduce the original scalars.
    np.testing.assert_array_almost_equal(random_scalars, decoded)

    # Explicit check of the bin creation against a manually calculated example.
    manual_scalars = [-2.5, -0.75, 0.2, 1.38, 2.99]
    manual_expected = [
        [0.5, 0.5, 0, 0, 0, 0, 0],
        [0, 0, 0.75, 0.25, 0, 0, 0],
        [0, 0, 0, 0.8, 0.2, 0, 0],
        [0, 0, 0, 0, 0.62, 0.38, 0],
        [0, 0, 0, 0, 0, 0.01, 0.99],
    ]

    manual_bins = scalar_to_support(manual_scalars, 3, reward_transformer=lambda x: x)
    np.testing.assert_array_almost_equal(manual_expected, manual_bins)
def log_batch(self, data_batch: typing.List) -> None:
    """
    Log value-prediction statistics for the given training batch to TensorBoard.

    Nothing is logged unless ``DEBUG_MODE`` is truthy and ``self.reference.steps`` is a multiple of
    ``LOG_RATE``.

    Logged statistics:
     - "sample probability": histogram of the sample weights multiplied by the batch size.
     - "v_targets" / "v_predict": histograms of the target values and of the scalar value predictions.
     - "v_mse": mean squared error between predicted and target values.

    :param data_batch: List of (observation, (target_pi, target_v), sample_weight) entries.
    """
    if not DEBUG_MODE or self.reference.steps % LOG_RATE != 0:
        return

    step = self.reference.steps

    observations, targets, sample_weight = list(zip(*data_batch))
    _, target_vs = list(map(np.asarray, zip(*targets)))
    observations = np.asarray(observations)

    # zip() yields tuples: multiplying a tuple by an int repeats it instead of scaling its entries,
    # so the weights must be converted to an array before undoing the 1/n scaling.
    sample_weight = np.asarray(sample_weight)
    priority = sample_weight * len(data_batch)  # Undo 1/n scaling to get priority

    tf.summary.histogram("sample probability", data=priority, step=step)

    _, vs = self.reference.neural_net.model.predict_on_batch(observations)
    # Unpack [batch-size, dims] to [batch-size,]
    v_reals = support_to_scalar(vs, self.reference.net_args.support_size).ravel()

    tf.summary.histogram("v_targets", data=target_vs, step=step)
    tf.summary.histogram("v_predict", data=v_reals, step=step)

    mse = np.mean((v_reals - target_vs) ** 2)
    tf.summary.scalar("v_mse", data=mse, step=step)
def predict(self, observations: np.ndarray) -> typing.Tuple[np.ndarray, float]:
    """
    Infer the move probability prior and the state value for a single state observation.

    The observation array receives a leading batch axis of length 1 before being handed to
    ``self.neural_net.model.predict``. The value output is converted from its distributional bins
    to a scalar with ``support_to_scalar``.

    :param observations: Observation representation of the form (width x height x (depth * time)
    :return: A tuple (pi, v):
        pi: The policy vector for the provided state, with the batch axis removed.
        v: The state value estimate as a Python scalar.
    """
    # Add the batch axis of length 1 expected by the network.
    batched_observation = np.expand_dims(observations, axis=0)

    policy, value_bins = self.neural_net.model.predict(batched_observation)

    # Collapse the categorical support to a scalar.
    value = support_to_scalar(value_bins, self.net_args.support_size)

    return policy[0], value.item()
def log_batch(self, data_batch: typing.List) -> None:
    """
    Log neural network statistics for the given training batch, unrolling the recurrent model.

    Nothing is logged unless DEBUG_MODE is truthy and self.reference.steps is a multiple of LOG_RATE.
    (Per the original documentation, debug mode is toggled with the '--debug' console argument to Main.py.)
    Note: toggling this functionality on will produce significantly larger tensorboard event files!

    Statistics include:
     - Priority sampling sample probabilities (sample weights multiplied by the batch size).
     - Predicted and target values at the root, and their mean squared error.
     - Per unroll step k: the policy cross-entropy per sample, predicted/target rewards and values,
       and the mean squared error of the reward and value predictions.
     - Summed norm of the neural network's variables (via safe_l2norm).
     - If net_args.dynamics_penalty > 0, per unroll step: KL divergence between the encoder output and
       the dynamics output, the cross-entropy of the dynamics output with itself, and, if the network
       has a 'decoder' attribute, the mean squared error of the decoded observation.

    :param data_batch: List of (observation, actions, targets, forward_observations, sample_weight) entries,
                       where targets unpacks into (target_vs, target_rs, target_pis).
    """
    if DEBUG_MODE and self.reference.steps % LOG_RATE == 0:
        # Transpose the batch: one tuple per field instead of one tuple per sample.
        observations, actions, targets, forward_observations, sample_weight = list(zip(*data_batch))
        actions, sample_weight = np.asarray(actions), np.asarray(sample_weight)
        # presumably each target array is (batch, K + 1, ...) with the unroll step on axis 1 -- TODO confirm
        target_vs, target_rs, target_pis = list(map(np.asarray, zip(*targets)))

        priority = sample_weight * len(data_batch)  # Undo 1/n scaling to get priority
        tf.summary.histogram(f"sample probability", data=priority, step=self.reference.steps)

        # Root inference: latent state, policy and value bins for the raw observations.
        s, pi, v = self.reference.neural_net.forward.predict_on_batch(np.asarray(observations))

        # Cast value bins to scalars and flatten to one value per sample.
        v_real = support_to_scalar(v, self.reference.net_args.support_size).ravel()

        tf.summary.histogram(f"v_predict_{0}", data=v_real, step=self.reference.steps)
        tf.summary.histogram(f"v_target_{0}", data=target_vs[:, 0], step=self.reference.steps)
        tf.summary.scalar(f"v_mse_{0}", data=np.mean((v_real - target_vs[:, 0]) ** 2), step=self.reference.steps)

        # Sum over one-hot-encoded actions. If this sum is zero, then there is no action --> leaf node.
        # NOTE(review): the sum is taken over target_pis (last axis), not over 'actions' -- confirm intent.
        absorb_k = 1.0 - tf.reduce_sum(target_pis, axis=-1)

        # Unroll the recurrent model step by step; 's' is rebound so each step starts from the previous output.
        collect = list()
        for k in range(actions.shape[1]):
            r, s, pi, v = self.reference.neural_net.recurrent.predict_on_batch([s, actions[:, k, :]])

            # NOTE(review): absorb_k[k, :] indexes the first axis with the unroll step, whereas every other
            # per-step tensor here is indexed as [:, k]. The stored value is never read in this method.
            # Looks like it should be absorb_k[:, k] -- confirm before relying on it.
            collect.append((s, v, r, pi, absorb_k[k, :]))

        # Log per-step prediction statistics. collect[t] holds the outputs for unroll step k = t + 1.
        for t, (s, v, r, pi, absorb) in enumerate(collect):
            k = t + 1

            # Cross-entropy between target and predicted policy per sample; 1e-8 guards against log(0).
            pi_loss = -np.sum(target_pis[:, k] * np.log(pi + 1e-8), axis=-1)
            self.log_distribution(pi_loss, f"pi_dist_{k}")

            # Cast value/reward bins to scalars and flatten to one value per sample.
            v_real = support_to_scalar(v, self.reference.net_args.support_size).ravel()
            r_real = support_to_scalar(r, self.reference.net_args.support_size).ravel()

            self.log_distribution(r_real, f"r_predict_{k}")
            self.log_distribution(v_real, f"v_predict_{k}")

            self.log_distribution(target_rs[:, k], f"r_target_{k}")
            self.log_distribution(target_vs[:, k], f"v_target_{k}")

            self.log(np.mean((r_real - target_rs[:, k]) ** 2), f"r_mse_{k}")
            self.log(np.mean((v_real - target_vs[:, k]) ** 2), f"v_mse_{k}")

        # Sum of safe_l2norm over every variable returned by the reference network.
        l2_norm = tf.reduce_sum([safe_l2norm(x) for x in self.reference.get_variables()])
        self.log(l2_norm, "l2 norm")

        # Option to track statistical properties of the dynamics model.
        if self.reference.net_args.dynamics_penalty > 0:
            forward_observations = np.asarray(forward_observations)
            # Compute statistics related to auto-encoding state dynamics:
            for t, (s, v, r, pi, absorb) in enumerate(collect):
                k = t + 1
                # Observations paired with unroll step t (indexed on axis 1).
                stacked_obs = forward_observations[:, t, ...]

                # Encode the observations directly and compare against the dynamics output 's'.
                s_enc = self.reference.neural_net.encoder.predict_on_batch(stacked_obs)
                kl_divergence = tf.keras.losses.kullback_leibler_divergence(s_enc, s)

                # Relative entropy of dynamics model and encoder.
                # Lower values indicate that the prediction model receives more stable input.
                self.log_distribution(kl_divergence, f"KL_Divergence_{k}")
                self.log(np.mean(kl_divergence), f"Mean_KLDivergence_{k}")

                # Internal entropy of the dynamics model
                s_entropy = tf.keras.losses.categorical_crossentropy(s, s)
                self.log(np.mean(s_entropy), f"mean_dynamics_entropy_{k}")

                if hasattr(self.reference.neural_net, "decoder"):
                    # If available, track the performance of a neural decoder from latent to real state.
                    stacked_obs_predict = self.reference.neural_net.decoder.predict_on_batch(s)
                    # Element-wise squared error between the real and the decoded observations.
                    se = (stacked_obs - stacked_obs_predict) ** 2

                    self.log(np.mean(se), f"decoder_error_{k}")