def __init__(self, q_values, observations, num_actions, cur_epsilon, softmax,
             softmax_temp, model_config):
    if softmax:
        action_dist = Categorical(q_values / softmax_temp)
        self.action = action_dist.sample()
        self.action_prob = tf.exp(action_dist.sampled_action_logp())
        return

    deterministic_actions = tf.argmax(q_values, axis=1)
    batch_size = tf.shape(observations)[0]

    # Special case masked out actions (q_value ~= -inf) so that we don't
    # even consider them for exploration.
    random_valid_action_logits = tf.where(
        tf.equal(q_values, tf.float32.min),
        tf.ones_like(q_values) * tf.float32.min, tf.ones_like(q_values))
    random_actions = tf.squeeze(
        tf.multinomial(random_valid_action_logits, 1), axis=1)

    chose_random = tf.random_uniform(
        tf.stack([batch_size]), minval=0, maxval=1,
        dtype=tf.float32) < cur_epsilon
    self.action = tf.where(chose_random, random_actions,
                           deterministic_actions)
    self.action_prob = None
def custom_loss(self, policy_loss, loss_inputs):
    # create a new input reader per worker
    reader = JsonReader(self.options["custom_options"]["input_files"])
    input_ops = reader.tf_input_ops()

    # define a secondary loss by building a graph copy with weight sharing
    obs = tf.cast(input_ops["obs"], tf.float32)
    logits, _ = self._build_layers_v2(
        {"obs": restore_original_dimensions(obs, self.obs_space)},
        self.num_outputs, self.options)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss
    action_dist = Categorical(logits, self.options)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(
        -action_dist.logp(input_ops["actions"]))
    total_loss = policy_loss \
        + self.options["custom_options"]["lambda1"] * policy_loss \
        + self.options["custom_options"]["lambda2"] * self.imitation_loss
    return total_loss
def sample_action_from_q_network(policy, q_model, input_dict, obs_space,
                                 action_space, config):
    # Action Q network.
    q_values, q_logits, q_dist = _compute_q_values(
        policy, q_model, input_dict[SampleBatch.CUR_OBS], obs_space,
        action_space)
    policy.q_values = q_values
    policy.q_func_vars = q_model.variables()

    # Noise vars for Q network except for layer normalization vars.
    if config["parameter_noise"]:
        _build_parameter_noise(
            policy,
            [var for var in policy.q_func_vars
             if "LayerNorm" not in var.name])
        policy.action_probs = tf.nn.softmax(policy.q_values)

    # TODO(sven): Move soft_q logic to different Exploration child-component.
    action_log_prob = None
    if config["soft_q"]:
        action_dist = Categorical(q_values / config["softmax_temp"])
        policy.output_actions = action_dist.sample()
        action_log_prob = action_dist.sampled_action_logp()
        policy.action_prob = tf.exp(action_log_prob)
    else:
        policy.output_actions = tf.argmax(q_values, axis=1)
        policy.action_prob = None

    return policy.output_actions, action_log_prob
def get_log_likelihood(policy, q_model, actions, input_dict, obs_space,
                       action_space, config):
    # Action Q network.
    q_vals = _compute_q_values(policy, q_model,
                               input_dict[SampleBatch.CUR_OBS], obs_space,
                               action_space)
    q_vals = q_vals[0] if isinstance(q_vals, tuple) else q_vals
    action_dist = Categorical(q_vals, q_model)
    return action_dist.logp(actions)
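# A hedged usage sketch (not part of the original source): the two DQN
# helpers above are usually handed to a policy builder rather than called
# directly. This assumes an RLlib version (~0.8.x) whose `build_tf_policy`
# accepts `action_sampler_fn` and `log_likelihood_fn` keyword arguments;
# `build_q_model` and `build_q_losses` are hypothetical placeholders for
# the usual model- and loss-construction functions.
from ray.rllib.policy.tf_policy_template import build_tf_policy

MyDQNTFPolicy = build_tf_policy(
    name="MyDQNTFPolicy",
    make_model=build_q_model,
    action_sampler_fn=sample_action_from_q_network,
    log_likelihood_fn=get_log_likelihood,
    loss_fn=build_q_losses,
)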
def _postprocess_helper_tf(self, obs, next_obs, actions):
    with (tf.GradientTape()
          if self.framework != "tf" else NullContextManager()) as tape:
        # Push both observations through feature net to get both phis.
        phis, _ = self.model._curiosity_feature_net({
            SampleBatch.OBS: tf.concat([obs, next_obs], axis=0)
        })
        phi, next_phi = tf.split(phis, 2)

        # Predict next phi with forward model.
        predicted_next_phi = self.model._curiosity_forward_fcnet(
            tf.concat(
                [phi, tf_one_hot(actions, self.action_space)], axis=-1))

        # Forward loss term (predicted phi', given phi and action vs
        # actually observed phi').
        forward_l2_norm_sqared = 0.5 * tf.reduce_sum(
            tf.square(predicted_next_phi - next_phi), axis=-1)
        forward_loss = tf.reduce_mean(forward_l2_norm_sqared)

        # Inverse loss term (predicted action that led from phi to phi' vs
        # actual action taken).
        phi_cat_next_phi = tf.concat([phi, next_phi], axis=-1)
        dist_inputs = self.model._curiosity_inverse_fcnet(phi_cat_next_phi)
        action_dist = Categorical(dist_inputs, self.model) if \
            isinstance(self.action_space, Discrete) else \
            MultiCategorical(
                dist_inputs, self.model, self.action_space.nvec)

        # Neg log(p); p=probability of observed action given the inverse-NN
        # predicted action distribution.
        inverse_loss = -action_dist.logp(actions)
        inverse_loss = tf.reduce_mean(inverse_loss)

        # Calculate the ICM loss.
        loss = (1.0 - self.beta) * inverse_loss + self.beta * forward_loss

    # Step the optimizer.
    if self.framework != "tf":
        grads = tape.gradient(loss, self._optimizer_var_list)
        grads_and_vars = [(g, v)
                          for g, v in zip(grads, self._optimizer_var_list)
                          if g is not None]
        update_op = self._optimizer.apply_gradients(grads_and_vars)
    else:
        update_op = self._optimizer.minimize(
            loss, var_list=self._optimizer_var_list)

    # Return the squared l2 norm and the optimizer update op.
    return forward_l2_norm_sqared, update_op
def testCategorical(self):
    num_samples = 100000
    logits = tf.placeholder(tf.float32, shape=(None, 10))
    z = 8 * (np.random.rand(10) - 0.5)
    data = np.tile(z, (num_samples, 1))
    c = Categorical(logits, {})  # dummy config dict
    sample_op = c.sample()
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    samples = sess.run(sample_op, feed_dict={logits: data})
    counts = np.zeros(10)
    for sample in samples:
        counts[sample] += 1.0
    probs = np.exp(z) / np.sum(np.exp(z))
    self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)
def custom_loss(self, policy_loss, loss_inputs):
    # Create a new input reader per worker.
    reader = JsonReader(
        self.model_config["custom_model_config"]["input_files"])
    input_ops = reader.tf_input_ops()

    # Define a secondary loss by building a graph copy with weight sharing.
    obs = restore_original_dimensions(
        tf.cast(input_ops["obs"], tf.float32), self.obs_space)
    logits, _ = self.forward({"obs": obs}, [], None)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # Compute the IL loss.
    action_dist = Categorical(logits, self.model_config)
    self.policy_loss = policy_loss
    self.imitation_loss = tf.reduce_mean(
        -action_dist.logp(input_ops["actions"]))
    return policy_loss + 10 * self.imitation_loss
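# A hedged usage sketch (not part of the original example): how a custom
# model exposing the `custom_loss` above is typically registered and
# selected via the trainer config. `ImitationFCNet` (the model class that
# defines `custom_loss`) and the input-file path are hypothetical
# placeholders.
from ray.rllib.models import ModelCatalog

ModelCatalog.register_custom_model("imitation_fcnet", ImitationFCNet)

config = {
    "model": {
        "custom_model": "imitation_fcnet",
        "custom_model_config": {
            "input_files": "/tmp/expert_demos.json",
        },
    },
}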
def custom_loss(self, policy_loss, loss_inputs):
    # create a new input reader per worker
    reader = JsonReader(
        self.model_config["custom_model_config"]["input_files"])
    input_ops = reader.tf_input_ops(
        self.model_config["custom_model_config"].get("expert_size", 1))

    # define a secondary loss by building a graph copy with weight sharing
    obs = restore_original_dimensions(
        tf.cast(input_ops["obs"], tf.float32), self.obs_space)
    logits, _ = self.forward({"obs": obs}, [], None)

    # You can also add self-supervised losses easily by referencing tensors
    # created during _build_layers_v2(). For example, an autoencoder-style
    # loss can be added as follows:
    # ae_loss = squared_diff(
    #     loss_inputs["obs"], Decoder(self.fcnet.last_layer))
    # print("FYI: You can also use these tensors: {}, ".format(loss_inputs))

    # compute the IL loss
    self.policy_loss = policy_loss

    (action_scores, model_logits,
     dist) = self.get_q_value_distributions(logits)
    model_logits = tf.squeeze(model_logits)
    action_dist = Categorical(model_logits, self.model_config)

    expert_logits = tf.cast(input_ops["actions"], tf.int32)
    expert_action = tf.math.argmax(expert_logits)
    expert_action_one_hot = tf.one_hot(expert_action, self.num_outputs)
    model_action = action_dist.deterministic_sample()
    model_action_one_hot = tf.one_hot(model_action, self.num_outputs)
    model_expert = model_action_one_hot * expert_action_one_hot

    imitation_loss = 0
    loss_type = self.model_config["custom_model_config"].get("loss", "ce")
    if loss_type == "ce":
        imitation_loss = tf.reduce_mean(-action_dist.logp(expert_logits))
    elif loss_type == "kl":
        expert_dist = Categorical(
            tf.one_hot(expert_logits, self.num_outputs), self.model_config)
        imitation_loss = tf.reduce_mean(-action_dist.kl(expert_dist))
    elif loss_type == "dqfd":
        max_value = float("-inf")
        Q_select = model_logits
        # TODO: difference in action_scores, dist and logits
        for a in range(self.num_outputs):
            max_value = tf.maximum(
                Q_select[a] + 0.8 * tf.cast(model_expert[a], tf.float32),
                max_value)
        imitation_loss = tf.reduce_mean(
            1 * (max_value - Q_select[tf.cast(expert_action, tf.int32)]))

    self.imitation_loss = imitation_loss
    total_loss = \
        self.model_config["custom_model_config"]["lambda1"] * policy_loss \
        + self.model_config["custom_model_config"]["lambda2"] \
        * self.imitation_loss
    return total_loss
def xyz_compute_actions(
        self,
        *,
        input_dict,
        explore=True,
        timestep=None,
        episodes=None,
        is_training=False,
        **kwargs,
) -> Tuple[TensorStructType, List[TensorType], Dict[str, TensorStructType]]:
    if timestep is None:
        timestep = self.global_timestep

    # Compute the Q-values for each possible action, using our Q-value
    # network.
    q_vals = self._compute_q_values(
        self.model, input_dict[SampleBatch.OBS], is_training=is_training)

    # Use a Categorical distribution for the exploration component.
    # This way, it may either sample stochastically (e.g. when using SoftQ)
    # or deterministically/greedily (e.g. when using EpsilonGreedy).
    distribution = Categorical(q_vals, self.model)

    # Call the exploration component's `get_exploration_action` method to
    # explore, if necessary.
    actions, logp = self.exploration.get_exploration_action(
        action_distribution=distribution, timestep=timestep, explore=explore)

    # Return (exploration) actions, state_outs (empty list), and extra outs.
    return (
        actions,
        [],
        {
            "q_values": q_vals,
            SampleBatch.ACTION_LOGP: logp,
            SampleBatch.ACTION_PROB: tf.exp(logp),
            SampleBatch.ACTION_DIST_INPUTS: q_vals,
        },
    )
def logp(self, actions):
    a1, a2 = actions[:, 0], actions[:, 1]
    a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
    a1_logits, a2_logits = self.model.action_model([self.inputs, a1_vec])
    return (Categorical(a1_logits).logp(a1) + Categorical(a2_logits).logp(a2))
def _a2_distribution(self, a1):
    a1_vec = tf.expand_dims(tf.cast(a1, tf.float32), 1)
    _, a2_logits = self.model.action_model([self.inputs, a1_vec])
    a2_dist = Categorical(a2_logits)
    return a2_dist
def _a1_distribution(self):
    BATCH = tf.shape(self.inputs)[0]
    a1_logits, _ = self.model.action_model(
        [self.inputs, tf.zeros((BATCH, 1))])
    a1_dist = Categorical(a1_logits)
    return a1_dist
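# A hedged sketch (not from the original source) of how the two helper
# distributions above are usually chained inside `sample()` of an
# autoregressive action distribution: a1 is sampled first, then a2's
# logits are conditioned on the sampled a1. The exact return packing
# (plain tuple vs. TupleActions) varies across RLlib versions.
def sample(self):
    a1_dist = self._a1_distribution()
    a1 = a1_dist.sample()
    a2_dist = self._a2_distribution(a1)
    a2 = a2_dist.sample()
    self._action_logp = a1_dist.logp(a1) + a2_dist.logp(a2)
    return (a1, a2)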
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["eager"] = True
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = {
        SampleBatch.CUR_OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                       [0.5, 0.6, 0.7, 0.8],
                                       [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True])
    }

    # tf.
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    vars = policy.model.trainable_variables()

    # Post-process (calculate simple (non-GAE) advantages) and attach to
    # train_batch dict.
    # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
    # [2.9701, 1.99, 1.0]
    train_batch = pg.post_process_advantages(policy, train_batch)

    # Check Advantage values.
    check(train_batch[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

    # Actual loss results.
    results = pg.pg_tf_loss(
        policy, policy.model, dist_class=Categorical,
        train_batch=train_batch)

    # Calculate expected results.
    expected_logits = fc(
        fc(train_batch[SampleBatch.CUR_OBS], vars[0].numpy(),
           vars[1].numpy()), vars[2].numpy(), vars[3].numpy())
    expected_logp = Categorical(expected_logits, policy.model).logp(
        train_batch[SampleBatch.ACTIONS])
    expected_loss = -np.mean(
        expected_logp * train_batch[Postprocessing.ADVANTAGES])
    check(results.numpy(), expected_loss, decimals=4)

    # Torch.
    config["use_pytorch"] = True
    trainer = pg.PGTrainer(config=config, env="CartPole-v0")
    policy = trainer.get_policy()
    train_batch = policy._lazy_tensor_dict(train_batch)
    results = pg.pg_torch_loss(
        policy, policy.model, dist_class=TorchCategorical,
        train_batch=train_batch)
    expected_logits = policy.model.last_output()
    expected_logp = TorchCategorical(expected_logits, policy.model).logp(
        train_batch[SampleBatch.ACTIONS])
    expected_loss = -np.mean(
        expected_logp.detach().numpy() *
        train_batch[Postprocessing.ADVANTAGES].numpy())
    check(results.detach().numpy(), expected_loss, decimals=4)
def _train(self):
    import tensorflow as tf
    policy = self.get_policy()
    steps = 0
    n_episodes = 1
    for _ in range(n_episodes):
        env = self.env._env.rail_env
        obs = self.env.reset()
        num_outputs = env.action_space[0]
        n_agents = env.get_num_agents()
        dispatcher = CellGraphDispatcher(env)

        # TODO: Update max_steps as per latest version:
        # https://gitlab.aicrowd.com/flatland/flatland-examples/blob/master/reinforcement_learning/multi_agent_training.py
        # max_steps = int(4 * 2 * (env.height + env.width + (n_agents / n_cities))) - 1
        max_steps = int(4 * 2 * (20 + env.height + env.width))
        episode_steps = 0
        episode_max_steps = 0
        episode_num_agents = 0
        episode_score = 0
        episode_done_agents = 0
        done = {}
        done["__all__"] = False

        # TODO: Support for batch update
        # batch_size = 2
        # logits, _ = policy.model.forward({"obs": np.vstack([obs[a], obs[a]])}, [], None)

        for step in range(max_steps):
            action_dict = dispatcher.step(env._elapsed_steps)

            with tf.GradientTape() as tape:
                imitation_loss = 0
                active_agents = 0
                for a in range(n_agents):
                    if not done.get(a) and obs.get(a) is not None:
                        active_agents += 1
                        expert_action = action_dict[a].value
                        input_dict = {"obs": np.expand_dims(obs[a], 0)}
                        input_dict["obs_flat"] = input_dict["obs"]
                        logits, _ = policy.model.forward(
                            input_dict, [], None)
                        model_logits = tf.squeeze(logits)
                        expert_logits = tf.cast(expert_action, tf.int32)

                        action_dist = Categorical(
                            logits, policy.model.model_config)
                        imitation_loss += tf.reduce_mean(-action_dist.logp(
                            tf.expand_dims(expert_logits, 0)))

                imitation_loss = imitation_loss / max(active_agents, 1)

            gradients = tape.gradient(imitation_loss,
                                      policy.model.trainable_variables())
            self.workers.local_worker().apply_gradients(gradients)

            weights = ray.put(self.workers.local_worker().get_weights())
            # print(self.workers.local_worker().get_weights()['default_policy'][0][:4])
            for e in self.workers.remote_workers():
                e.set_weights.remote(weights)

            obs, all_rewards, done, info = self.env.step(action_dict)
            steps += 1

            for agent, agent_info in info.items():
                if agent_info["agent_done"]:
                    episode_done_agents += 1

            if done["__all__"]:
                for agent, agent_info in info.items():
                    if episode_max_steps == 0:
                        episode_max_steps = agent_info["max_episode_steps"]
                        episode_num_agents = agent_info["num_agents"]
                    episode_steps = max(episode_steps,
                                        agent_info["agent_step"])
                    episode_score += agent_info["agent_score"]
                print(float(episode_done_agents) / episode_num_agents)
                break

    norm_factor = 1.0 / (episode_max_steps * episode_num_agents)
    result = {
        "expert_episode_reward_mean": episode_score,
        "episode_reward_mean": episode_score,
        "expert_episode_completion_mean":
            float(episode_done_agents) / episode_num_agents,
        "expert_episode_score_normalized": episode_score * norm_factor,
        "episodes_this_iter": n_episodes,
        "timesteps_this_iter": steps,
    }

    # Code taken from _train method of trainer_template.py - TODO: Not working
    # res = self.collect_metrics()
    # res = {}
    # res.update(
    #     optimizer_steps_this_iter=steps,
    #     episode_reward_mean=episode_score,
    #     info=res.get("info", {}))
    # res.update(expert_scores=result)

    return result
def _actions_distribution(self):
    a_dists = [Categorical(logit) for logit in self.model.logits]
    return a_dists
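# A hedged sketch (not from the original source): with one Categorical per
# action head as above, a joint log-likelihood is typically the sum of the
# per-head log-probabilities, assuming the heads are conditionally
# independent given the model output. `actions` is assumed to have shape
# [batch, num_heads], one integer action per head; `_actions_logp` is a
# hypothetical companion method.
def _actions_logp(self, actions):
    a_dists = self._actions_distribution()
    return tf.add_n(
        [dist.logp(actions[:, i]) for i, dist in enumerate(a_dists)])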