def __init__(self, time_step_spec, action_spec, seed=None, outer_dims=None):
  """Initializes the RandomPyPolicy.

  Args:
    time_step_spec: Reference `time_step_spec`. If not None and `outer_dims`
      is not provided, this is used to infer the outer_dims required for the
      given time_step when `action` is called.
    action_spec: A nest of `BoundedArraySpec` representing the actions to
      sample from.
    seed: Optional seed used to instantiate a random number generator.
    outer_dims: An optional list/tuple specifying outer dimensions to add to
      the spec shape before sampling. If unspecified, the outer_dims are
      derived from the outer_dims in the given observation when `action` is
      called.
  """
  self._seed = seed
  self._outer_dims = outer_dims
  self._rng = np.random.RandomState(seed)
  if time_step_spec is None:
    time_step_spec = ts.time_step_spec()

  super(RandomPyPolicy, self).__init__(
      time_step_spec=time_step_spec, action_spec=action_spec)
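# A minimal usage sketch for the initializer above (an assumption-labeled
# example, not taken from the surrounding file). It relies only on the public
# TF-Agents modules random_py_policy, array_spec and time_step; the spec
# values and shapes below are made up for illustration.
import numpy as np
from tf_agents.policies import random_py_policy
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts

observation_spec = array_spec.ArraySpec((1,), np.float32)
action_spec = array_spec.BoundedArraySpec((2,), np.int32, minimum=-10, maximum=10)

# outer_dims=(3,) asks the policy to sample a batch of 3 actions per call,
# so the outer dimensions do not have to be inferred from the observation.
policy = random_py_policy.RandomPyPolicy(
    time_step_spec=ts.time_step_spec(observation_spec),
    action_spec=action_spec,
    outer_dims=(3,))

first_step = ts.restart(np.zeros((3, 1), dtype=np.float32), batch_size=3)
action_step = policy.action(first_step)  # action_step.action has shape (3, 2)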
def testTrainPerArmAgent(self):
  obs_spec = bandit_spec_utils.create_per_arm_observation_spec(
      2, 3, 4, add_num_actions_feature=True)
  time_step_spec = ts.time_step_spec(obs_spec)
  reward_net = (
      global_and_arm_feature_network.create_feed_forward_common_tower_network(
          obs_spec, (4, 3), (3, 4), (4, 2)))
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)

  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      accepts_per_arm_features=True,
      optimizer=optimizer)
  observations = {
      bandit_spec_utils.GLOBAL_FEATURE_KEY:
          tf.constant([[1, 2], [3, 4]], dtype=tf.float32),
      bandit_spec_utils.PER_ARM_FEATURE_KEY:
          tf.cast(
              tf.reshape(tf.range(24), shape=[2, 4, 3]), dtype=tf.float32),
      bandit_spec_utils.NUM_ACTIONS_FEATURE_KEY:
          tf.ones([2], dtype=tf.int32)
  }
  actions = np.array([0, 3], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps(
      observations, rewards)
  action_step = policy_step.PolicyStep(
      action=tf.convert_to_tensor(actions),
      info=policy_utilities.PerArmPolicyInfo(
          chosen_arm_features=np.array([[1, 2, 3], [3, 2, 1]],
                                       dtype=np.float32)))
  experience = _get_experience(initial_step, action_step, final_step)
  agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
def testDropoutFCLayersWithConv(self, training):
  observation_spec = tensor_spec.BoundedTensorSpec((8, 8, 3), tf.float32, 0, 1)
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(time_step_spec, outer_dims=(1,))
  action_spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3)
  net = actor_distribution_network.ActorDistributionNetwork(
      observation_spec,
      action_spec,
      conv_layer_params=[(4, 2, 2)],
      fc_layer_params=[5],
      dropout_layer_params=[0.5])

  modes = []
  num_modes = 10
  for _ in range(num_modes):
    action_distributions, _ = net(
        time_step.observation, time_step.step_type, (), training=training)
    modes.append(action_distributions.mode())

  self.evaluate(tf.compat.v1.global_variables_initializer())
  modes = self.evaluate(modes)

  # Verify that the modes from action_distributions are not all the same.
  any_modes_differ = False
  for i in range(num_modes):
    for j in range(i + 1, num_modes):
      any_modes_differ = np.linalg.norm(modes[i] - modes[j]) > 1e-6
      if any_modes_differ:
        self.assertEqual(training, any_modes_differ)
        return

  self.assertEqual(training, any_modes_differ)
def test_handle_preprocessing_layers(self, outer_dims):
  observation_spec = (tensor_spec.TensorSpec([1], tf.float32),
                      tensor_spec.TensorSpec([], tf.float32))
  time_step_spec = ts.time_step_spec(observation_spec)
  time_step = tensor_spec.sample_spec_nest(
      time_step_spec, outer_dims=outer_dims)

  action_spec = tensor_spec.BoundedTensorSpec((2,), tf.float32, 2, 3)

  preprocessing_layers = (tf.keras.layers.Dense(4),
                          sequential_layer.SequentialLayer([
                              tf.keras.layers.Reshape((1,)),
                              tf.keras.layers.Dense(4)
                          ]))

  net = actor_network.ActorNetwork(
      observation_spec,
      action_spec,
      preprocessing_layers=preprocessing_layers,
      preprocessing_combiner=tf.keras.layers.Add())

  action, _ = net(time_step.observation, time_step.step_type, ())
  self.assertEqual(list(outer_dims) + [2], action.shape.as_list())
  self.assertGreater(len(net.trainable_variables), 4)
def testExp3Update(self, observation_shape, num_actions, action, log_prob,
                   reward, learning_rate):
  """Check EXP3 updates for specified actions and rewards."""
  # Compute expected update for each action.
  expected_update_value = exp3_agent.exp3_update_value(reward, log_prob)
  expected_update = np.zeros(num_actions)
  for a, u in zip(action, self.evaluate(expected_update_value)):
    expected_update[a] += u

  # Construct a `Trajectory` for the given action, log prob and reward.
  time_step_spec = time_step.time_step_spec(
      tensor_spec.TensorSpec(observation_shape, tf.float32))
  action_spec = tensor_spec.BoundedTensorSpec(
      dtype=tf.int32, shape=(), minimum=0, maximum=num_actions - 1)
  initial_step, final_step = _get_initial_and_final_steps(
      observation_shape, reward)
  action_step = _get_action_step(action, log_prob)
  experience = _get_experience(initial_step, action_step, final_step)

  # Construct an agent and perform the update. Record initial and final
  # weights.
  agent = exp3_agent.Exp3Agent(
      time_step_spec=time_step_spec,
      action_spec=action_spec,
      learning_rate=learning_rate)
  self.evaluate(agent.initialize())
  initial_weights = self.evaluate(agent.weights)
  loss_info = agent.train(experience)
  self.evaluate(loss_info)
  final_weights = self.evaluate(agent.weights)
  update = final_weights - initial_weights

  # Check that the actual update matches expectations.
  self.assertAllClose(expected_update, update)
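# A small, self-contained illustration of the accumulation loop in the test
# above. The numbers are made up, and `update_value` merely stands in for the
# output of exp3_agent.exp3_update_value; the point is only how per-sample
# values are summed into the slot of the arm chosen in each sample.
import numpy as np

num_actions = 3
action = np.array([1, 1, 0])              # arm chosen in each sample
update_value = np.array([0.2, 0.3, 0.5])  # per-sample update values

expected_update = np.zeros(num_actions)
for a, u in zip(action, update_value):
  expected_update[a] += u                 # accumulate into the chosen arm

# expected_update is now [0.5, 0.5, 0.0]: arm 0 receives the third sample's
# value, arm 1 the sum of the first two, and arm 2 was never chosen.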
def testTrainAgentWithMask(self):
  reward_net = DummyNet(self._observation_spec, self._action_spec)
  optimizer = tf.compat.v1.train.GradientDescentOptimizer(learning_rate=0.1)
  time_step_spec = ts.time_step_spec((tensor_spec.TensorSpec([2], tf.float32),
                                      tensor_spec.TensorSpec([3], tf.int32)))
  agent = greedy_agent.GreedyRewardPredictionAgent(
      time_step_spec,
      self._action_spec,
      reward_network=reward_net,
      optimizer=optimizer,
      observation_and_action_constraint_splitter=lambda x: (x[0], x[1]))
  observations = (np.array([[1, 2], [3, 4]], dtype=np.float32),
                  np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32))
  actions = np.array([0, 1], dtype=np.int32)
  rewards = np.array([0.5, 3.0], dtype=np.float32)
  initial_step, final_step = _get_initial_and_final_steps_with_action_mask(
      observations, rewards)
  action_step = _get_action_step(actions)
  experience = _get_experience(initial_step, action_step, final_step)
  loss_before, _ = agent.train(experience, None)
  loss_after, _ = agent.train(experience, None)
  self.evaluate(tf.compat.v1.initialize_all_variables())
  self.assertAllClose(self.evaluate(loss_before), 42.25)
  self.assertAllClose(self.evaluate(loss_after), 93.46)
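# A hedged sketch of what the observation_and_action_constraint_splitter in
# the test above does, assuming the observation is the 2-tuple
# (features, mask) constructed there; the function name is illustrative and
# not part of the library.
import numpy as np

def split_observation_and_mask(observation):
  features, mask = observation  # network input and per-action validity mask
  return features, mask

features, mask = split_observation_and_mask(
    (np.array([[1, 2], [3, 4]], dtype=np.float32),
     np.array([[1, 0, 0], [1, 1, 0]], dtype=np.int32)))
# mask[i, j] == 1 means action j may be selected for observation i.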