def testCategorical(self): num_samples = 100000 logits = tf.placeholder(tf.float32, shape=(None, 10)) z = 8 * (np.random.rand(10) - 0.5) data = np.tile(z, (num_samples, 1)) c = Categorical(logits) sample_op = c.sample() sess = tf.Session() sess.run(tf.global_variables_initializer()) samples = sess.run(sample_op, feed_dict={logits: data}) counts = np.zeros(10) for sample in samples: counts[sample] += 1.0 probs = np.exp(z) / np.sum(np.exp(z)) self.assertTrue(np.sum(np.abs(probs - counts / num_samples)) <= 0.01)
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() # Create input placeholders if existing_inputs: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: if isinstance(action_space, gym.spaces.Discrete): ac_size = action_space.n actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder(tf.float32, [None, ac_size], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) action_dist = dist_class(self.model.outputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def to_batches(tensor): if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards, dtype=tf.bool) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. self.loss = VTraceLoss( actions=to_batches(actions)[:-1], actions_logp=to_batches(action_dist.logp(actions))[:-1], actions_entropy=to_batches(action_dist.entropy())[:-1], dones=to_batches(dones)[:-1], behaviour_logits=to_batches(behaviour_logits)[:-1], target_logits=to_batches(self.model.outputs)[:-1], discount=config["gamma"], rewards=to_batches(rewards)[:-1], values=to_batches(values)[:-1], bootstrap_value=to_batches(values)[-1], valid_mask=to_batches(mask)[:-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging model_dist = Categorical(self.model.outputs) behaviour_dist = Categorical(behaviour_logits) self.KLs = model_dist.kl(behaviour_dist) self.mean_KL = tf.reduce_mean(self.KLs) self.max_KL = tf.reduce_max(self.KLs) self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(to_batches(values)[:-1], [-1])), "mean_KL": self.mean_KL, "max_KL": self.max_KL, "median_KL": self.median_KL, }, }