def build_q_model_and_distribution(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # Try to infer the last layer size, otherwise fall back to 256.
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_SCOPE,
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        num_atoms=config["num_atoms"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm)
    policy.q_func_vars = policy.q_model.variables()

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_TARGET_SCOPE,
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        num_atoms=config["num_atoms"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm)
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model, TorchCategorical
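# The two variable lists built above (policy.q_func_vars and
# policy.target_q_func_vars) are what the trainer-side target-network update
# consumes. A minimal PyTorch sketch of that update, assuming the variables
# are torch parameters (the helper name `soft_update` is illustrative, not
# RLlib API):
import torch


def soft_update(q_vars, target_q_vars, tau=1.0):
    """Blend tau * online + (1 - tau) * target into the target variables.

    tau=1.0 reproduces the hard copy typically done right after construction.
    """
    with torch.no_grad():
        for var, var_target in zip(q_vars, target_q_vars):
            var_target.copy_(tau * var + (1.0 - tau) * var_target)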
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( action_space)) self.config = config self.cur_epsilon = 1.0 dim_actions = action_space.shape[0] low_action = action_space.low high_action = action_space.high self.actor_optimizer = tf.train.AdamOptimizer( learning_rate=config["actor_lr"]) self.critic_optimizer = tf.train.AdamOptimizer( learning_rate=config["critic_lr"]) def _build_q_network(obs, actions): return QNetwork(ModelCatalog.get_model(obs, 1, config["model"]), actions, config["critic_hiddens"], config["critic_hidden_activation"]).value def _build_p_network(obs): return PNetwork(ModelCatalog.get_model(obs, 1, config["model"]), dim_actions, config["actor_hiddens"], config["actor_hidden_activation"]).action_scores def _build_action_network(p_values, stochastic, eps): return ActionNetwork(p_values, low_action, high_action, stochastic, eps, config["exploration_theta"], config["exploration_sigma"]).actions # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) # Actor: P (policy) network with tf.variable_scope(P_SCOPE) as scope: p_values = _build_p_network(self.cur_observations) self.p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE): self.output_actions = _build_action_network( p_values, self.stochastic, self.eps) with tf.variable_scope(A_SCOPE, reuse=True): exploration_sample = tf.get_variable(name="ornstein_uhlenbeck") self.reset_noise_op = tf.assign(exploration_sample, dim_actions * [.0]) # Replay inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="observation") self.act_t = tf.placeholder(tf.float32, shape=(None, ) + action_space.shape, name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder(tf.float32, [None], name="weight") # p network evaluation with tf.variable_scope(P_SCOPE, reuse=True) as scope: self.p_t = _build_p_network(self.obs_t) # target p network evaluation with tf.variable_scope(P_TARGET_SCOPE) as scope: p_tp1 = _build_p_network(self.obs_tp1) target_p_func_vars = _scope_vars(scope.name) # Action outputs with tf.variable_scope(A_SCOPE, reuse=True): deterministic_flag = tf.constant(value=False, dtype=tf.bool) zero_eps = tf.constant(value=.0, dtype=tf.float32) output_actions = _build_action_network(self.p_t, deterministic_flag, zero_eps) output_actions_estimated = _build_action_network( p_tp1, deterministic_flag, zero_eps) # q network evaluation with tf.variable_scope(Q_SCOPE) as scope: q_t = _build_q_network(self.obs_t, self.act_t) self.q_func_vars = _scope_vars(scope.name) with tf.variable_scope(Q_SCOPE, reuse=True): q_tp0 = _build_q_network(self.obs_t, output_actions) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1 = _build_q_network(self.obs_tp1, output_actions_estimated) target_q_func_vars = _scope_vars(scope.name) self.loss = ActorCriticLoss(q_t, q_tp1, q_tp0, self.importance_weights, self.rew_t, self.done_mask, config["gamma"], config["n_step"], config["use_huber"], 
config["huber_threshold"]) if config["l2_reg"] is not None: for var in self.p_func_vars: if "bias" not in var.name: self.loss.actor_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: self.loss.critic_loss += (config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)) # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") self.tau = tf.placeholder(tf.float32, (), name="tau") update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( sorted(self.p_func_vars, key=lambda v: v.name), sorted(target_p_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) self.update_target_expr = tf.group(*update_target_expr) self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] self.is_training = tf.placeholder_with_default(True, ()) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.loss.total_loss, loss_inputs=self.loss_inputs, is_training=self.is_training) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.TensorFlowVariables( tf.group(q_tp0, q_tp1), self.sess) # Hard initial update self.update_target(tau=1.0)
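# The `update_target_expr` group above is driven by the `tau` placeholder, so
# soft (Polyak) and hard updates share the same op. A hedged sketch of how a
# trainer would invoke it (the wrapper itself is illustrative; attribute names
# mirror the snippet above):
def update_target(policy, tau=None):
    tau = tau if tau is not None else policy.tau_value
    return policy.sess.run(
        policy.update_target_expr, feed_dict={policy.tau: tau})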
def get_action_dist(action_space, config, dist_type=None, torch=False):
    """Returns action distribution class and size for the given action space.

    Args:
        action_space (Space): Action space of the target gym env.
        config (dict): Optional model config.
        dist_type (str): Optional identifier of the action distribution.
        torch (bool): Optional whether to return a PyTorch distribution.

    Returns:
        dist_class (ActionDistribution): Python class of the distribution.
        dist_dim (int): The size of the input vector to the distribution.
    """
    config = config or MODEL_DEFAULTS
    dist = None
    if config.get("custom_action_dist"):
        action_dist_name = config["custom_action_dist"]
        logger.debug(
            "Using custom action distribution {}".format(action_dist_name))
        dist = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name)
    elif isinstance(action_space, gym.spaces.Box):
        if len(action_space.shape) > 1:
            raise UnsupportedSpaceException(
                "Action space has multiple dimensions "
                "{}. ".format(action_space.shape) +
                "Consider reshaping this into a single dimension, "
                "using a custom action distribution, "
                "using a Tuple action space, or the multi-agent API.")
        if dist_type is None:
            dist = TorchDiagGaussian if torch else DiagGaussian
        elif dist_type == "deterministic":
            dist = Deterministic
    elif isinstance(action_space, gym.spaces.Discrete):
        dist = TorchCategorical if torch else Categorical
    elif isinstance(action_space, gym.spaces.Tuple):
        if torch:
            raise NotImplementedError(
                "Tuple action spaces not supported for PyTorch.")
        child_dist = []
        input_lens = []
        for action in action_space.spaces:
            dist, action_size = ModelCatalog.get_action_dist(action, config)
            child_dist.append(dist)
            input_lens.append(action_size)
        return partial(
            MultiActionDistribution,
            child_distributions=child_dist,
            action_space=action_space,
            input_lens=input_lens), sum(input_lens)
    elif isinstance(action_space, Simplex):
        if torch:
            raise NotImplementedError(
                "Simplex action spaces not supported for PyTorch.")
        dist = Dirichlet
    elif isinstance(action_space, gym.spaces.MultiDiscrete):
        if torch:
            raise NotImplementedError(
                "MultiDiscrete action spaces not supported for PyTorch.")
        return partial(MultiCategorical, input_lens=action_space.nvec), \
            int(sum(action_space.nvec))
    elif isinstance(action_space, gym.spaces.Dict):
        raise NotImplementedError(
            "Dict action spaces are not supported; consider using "
            "gym.spaces.Tuple instead.")

    if dist is None:
        raise NotImplementedError("Unsupported args: {} {}".format(
            action_space, dist_type))

    return dist, dist.required_model_output_shape(action_space, config)
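# Hedged usage sketch for the resolver above: a 1-D Box space should map to
# DiagGaussian with an input size of 2 * action_dim (mean and log-std per
# dimension). The space below is illustrative and the exact return values may
# vary across versions:
import gym

dist_class, dist_dim = ModelCatalog.get_action_dist(
    gym.spaces.Box(low=-1.0, high=1.0, shape=(3,)), config=None)
# dist_class -> DiagGaussian (TorchDiagGaussian with torch=True), dist_dim -> 6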
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() self.grads = None if isinstance(action_space, gym.spaces.Discrete): is_multidiscrete = False actions_shape = [None] output_hidden_shape = [action_space.n] elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete): is_multidiscrete = True actions_shape = [None, len(action_space.nvec)] output_hidden_shape = action_space.nvec.astype(np.int32) else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) # Create input placeholders if existing_inputs: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: actions = tf.placeholder(tf.int64, actions_shape, name="ac") dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( tf.float32, [None, sum(output_hidden_shape)], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Unpack behaviour logits unpacked_behaviour_logits = tf.split(behaviour_logits, output_hidden_shape, axis=1) # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) unpacked_outputs = tf.split(self.model.outputs, output_hidden_shape, axis=1) dist_inputs = unpacked_outputs if is_multidiscrete else \ self.model.outputs action_dist = dist_class(dist_inputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def make_time_major(tensor, drop_last=False): """Swaps batch and trajectory axis. Args: tensor: A tensor or list of tensors to reshape. drop_last: A bool indicating whether to drop the last trajectory item. Returns: res: A tensor with swapped axes or a list of tensors with swapped axes. """ if isinstance(tensor, list): return [make_time_major(t, drop_last) for t in tensor] if self.model.state_init: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. 
TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes res = tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if drop_last: return res[:-1] return res if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards, dtype=tf.bool) # Prepare actions for loss loss_actions = actions if is_multidiscrete else tf.expand_dims(actions, axis=1) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. self.loss = VTraceLoss( actions=make_time_major(loss_actions, drop_last=True), actions_logp=make_time_major(action_dist.logp(actions), drop_last=True), actions_entropy=make_time_major(action_dist.entropy(), drop_last=True), dones=make_time_major(dones, drop_last=True), behaviour_logits=make_time_major(unpacked_behaviour_logits, drop_last=True), target_logits=make_time_major(unpacked_outputs, drop_last=True), discount=config["gamma"], rewards=make_time_major(rewards, drop_last=True), values=make_time_major(values, drop_last=True), bootstrap_value=make_time_major(values)[-1], valid_mask=make_time_major(mask, drop_last=True), vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging model_dist = MultiCategorical(unpacked_outputs) behaviour_dist = MultiCategorical(unpacked_behaviour_logits) kls = model_dist.kl(behaviour_dist) if len(kls) > 1: self.KL_stats = {} for i, kl in enumerate(kls): self.KL_stats.update({ "mean_KL_{}".format(i): tf.reduce_mean(kl), "max_KL_{}".format(i): tf.reduce_max(kl), "median_KL_{}".format(i): tf.contrib.distributions.percentile(kl, 50.0), }) else: self.KL_stats = { "mean_KL": tf.reduce_mean(kls[0]), "max_KL": tf.reduce_max(kls[0]), "median_KL": tf.contrib.distributions.percentile(kls[0], 50.0), } # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "stats": dict( { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(make_time_major(values, drop_last=True), [-1])), }, **self.KL_stats), }
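# A NumPy sketch of the [B * T] -> [T, B] reordering that `make_time_major`
# performs before the V-trace loss; shapes and values are illustrative:
import numpy as np

B, T = 3, 4
flat = np.arange(B * T)                          # three rollouts of four steps
time_major = flat.reshape(B, T).swapaxes(0, 1)   # shape (T, B) = (4, 3)
assert time_major[:, 0].tolist() == [0, 1, 2, 3]  # first rollout down column 0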
def build_ddpg_models(policy, observation_space, action_space, config):
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    elif len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if policy.config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = TorchNoopModel if config["use_pytorch"] else NoopModel
        num_outputs = int(np.product(observation_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=DDPGTorchModel
        if config["use_pytorch"] else DDPGTFModel,
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=observation_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=DDPGTorchModel
        if config["use_pytorch"] else DDPGTFModel,
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        twin_q=config["twin_q"],
        add_layer_norm=(policy.config["exploration_config"].get("type") ==
                        "ParameterNoise"),
    )

    return policy.model
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config) if not isinstance(action_space, Box): raise UnsupportedSpaceException( "Action space {} is not supported for DDPG.".format( action_space)) if len(action_space.shape) > 1: raise UnsupportedSpaceException( "Action space has multiple dimensions " "{}. ".format(action_space.shape) + "Consider reshaping this into a single dimension, " "using a Tuple action space, or the multi-agent API.") self.config = config # Create global step for counting the number of update operations. self.global_step = tf.train.get_or_create_global_step() # Create sampling timestep placeholder. timestep = tf.placeholder(tf.int32, (), name="timestep") # use separate optimizers for actor & critic self._actor_optimizer = tf.train.AdamOptimizer( learning_rate=self.config["actor_lr"]) self._critic_optimizer = tf.train.AdamOptimizer( learning_rate=self.config["critic_lr"]) # Observation inputs. self.cur_observations = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="cur_obs") with tf.variable_scope(POLICY_SCOPE) as scope: policy_out, self.policy_model = self._build_policy_network( self.cur_observations, observation_space, action_space) self.policy_vars = scope_vars(scope.name) # Noise vars for P network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ var for var in self.policy_vars if "LayerNorm" not in var.name ]) # Create exploration component. self.exploration = self._create_exploration(action_space, config) explore = tf.placeholder_with_default(True, (), name="is_exploring") # Action outputs with tf.variable_scope(ACTION_SCOPE): self.output_actions, _ = self.exploration.get_exploration_action( policy_out, Deterministic, self.policy_model, timestep, explore) # Replay inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape, name="observation") self.act_t = tf.placeholder(tf.float32, shape=(None, ) + action_space.shape, name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder(tf.float32, [None], name="weight") # policy network evaluation with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope: prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) self.policy_t, _ = self._build_policy_network( self.obs_t, observation_space, action_space) policy_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target policy network evaluation with tf.variable_scope(POLICY_TARGET_SCOPE) as scope: policy_tp1, _ = self._build_policy_network(self.obs_tp1, observation_space, action_space) target_policy_vars = scope_vars(scope.name) # Action outputs with tf.variable_scope(ACTION_SCOPE, reuse=True): if config["smooth_target_policy"]: target_noise_clip = self.config["target_noise_clip"] clipped_normal_sample = tf.clip_by_value( tf.random_normal(tf.shape(policy_tp1), stddev=self.config["target_noise"]), -target_noise_clip, target_noise_clip) policy_tp1_smoothed = tf.clip_by_value( policy_tp1 + clipped_normal_sample, action_space.low * tf.ones_like(policy_tp1), action_space.high * tf.ones_like(policy_tp1)) else: # no smoothing, just use deterministic actions policy_tp1_smoothed = policy_tp1 # q network evaluation prev_update_ops = 
set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) with tf.variable_scope(Q_SCOPE) as scope: # Q-values for given actions & observations in given current q_t, self.q_model = self._build_q_network(self.obs_t, observation_space, action_space, self.act_t) self.q_func_vars = scope_vars(scope.name) self.stats = { "mean_q": tf.reduce_mean(q_t), "max_q": tf.reduce_max(q_t), "min_q": tf.reduce_min(q_t), } with tf.variable_scope(Q_SCOPE, reuse=True): # Q-values for current policy (no noise) in given current state q_t_det_policy, _ = self._build_q_network(self.obs_t, observation_space, action_space, self.policy_t) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_SCOPE) as scope: twin_q_t, self.twin_q_model = self._build_q_network( self.obs_t, observation_space, action_space, self.act_t) self.twin_q_func_vars = scope_vars(scope.name) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target q network evaluation with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, action_space, policy_tp1_smoothed) target_q_func_vars = scope_vars(scope.name) if self.config["twin_q"]: with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope: twin_q_tp1, _ = self._build_q_network(self.obs_tp1, observation_space, action_space, policy_tp1_smoothed) twin_target_q_func_vars = scope_vars(scope.name) if self.config["twin_q"]: self.critic_loss, self.actor_loss, self.td_error \ = self._build_actor_critic_loss( q_t, q_tp1, q_t_det_policy, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1) else: self.critic_loss, self.actor_loss, self.td_error \ = self._build_actor_critic_loss( q_t, q_tp1, q_t_det_policy) if config["l2_reg"] is not None: for var in self.policy_vars: if "bias" not in var.name: self.actor_loss += (config["l2_reg"] * tf.nn.l2_loss(var)) for var in self.q_func_vars: if "bias" not in var.name: self.critic_loss += (config["l2_reg"] * tf.nn.l2_loss(var)) if self.config["twin_q"]: for var in self.twin_q_func_vars: if "bias" not in var.name: self.critic_loss += (config["l2_reg"] * tf.nn.l2_loss(var)) # update_target_fn will be called periodically to copy Q network to # target Q network self.tau_value = config.get("tau") self.tau = tf.placeholder(tf.float32, (), name="tau") update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) if self.config["twin_q"]: for var, var_target in zip( sorted(self.twin_q_func_vars, key=lambda v: v.name), sorted(twin_target_q_func_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) for var, var_target in zip( sorted(self.policy_vars, key=lambda v: v.name), sorted(target_policy_vars, key=lambda v: v.name)): update_target_expr.append( var_target.assign(self.tau * var + (1.0 - self.tau) * var_target)) self.update_target_expr = tf.group(*update_target_expr) self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (SampleBatch.REWARDS, self.rew_t), (SampleBatch.NEXT_OBS, self.obs_tp1), (SampleBatch.DONES, self.done_mask), (PRIO_WEIGHTS, self.importance_weights), ] input_dict = dict(self.loss_inputs) if self.config["use_state_preprocessor"]: # Model self-supervised losses self.actor_loss = self.policy_model.custom_loss( self.actor_loss, input_dict) self.critic_loss = 
self.q_model.custom_loss( self.critic_loss, input_dict) if self.config["twin_q"]: self.critic_loss = self.twin_q_model.custom_loss( self.critic_loss, input_dict) TFPolicy.__init__(self, observation_space, action_space, self.config, self.sess, obs_input=self.cur_observations, sampled_action=self.output_actions, loss=self.actor_loss + self.critic_loss, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops + policy_batchnorm_update_ops, explore=explore, timestep=timestep) self.sess.run(tf.global_variables_initializer()) # Note that this encompasses both the policy and Q-value networks and # their corresponding target networks self.variables = ray.experimental.tf_utils.TensorFlowVariables( tf.group(q_t_det_policy, q_tp1, self._actor_optimizer.variables(), self._critic_optimizer.variables()), self.sess) # Hard initial update self.update_target(tau=1.0)
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) if not isinstance(action_space, Discrete): raise UnsupportedSpaceException( "Action space {} is not supported for DQN.".format( action_space)) self.config = config self.cur_epsilon = 1.0 num_actions = action_space.n def _build_q_network(obs): return QNetwork( ModelCatalog.get_model(obs, 1, config["model"]), num_actions, config["dueling"], config["hiddens"]).value # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) # Action Q network with tf.variable_scope(Q_SCOPE) as scope: q_values = _build_q_network(self.cur_observations) self.q_func_vars = _scope_vars(scope.name) # Action outputs self.output_actions = QValuePolicy(q_values, self.cur_observations, num_actions, self.stochastic, self.eps).action # Replay inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.act_t = tf.placeholder(tf.int32, [None], name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") # q network evaluation with tf.variable_scope(Q_SCOPE, reuse=True): q_t = _build_q_network(self.obs_t) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1 = _build_q_network(self.obs_tp1) self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. q_t_selected = tf.reduce_sum(q_t * tf.one_hot(self.act_t, num_actions), 1) # compute estimate of best possible value starting from state at t + 1 if config["double_q"]: with tf.variable_scope(Q_SCOPE, reuse=True): q_tp1_using_online_net = _build_q_network(self.obs_tp1) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best = tf.reduce_sum( q_tp1 * tf.one_hot(q_tp1_best_using_online_net, num_actions), 1) else: q_tp1_best = tf.reduce_max(q_tp1, 1) self.loss = QLoss(q_t_selected, q_tp1_best, self.importance_weights, self.rew_t, self.done_mask, config["gamma"], config["n_step"]) # update_target_fn will be called periodically to copy Q network to # target Q network update_target_expr = [] for var, var_target in zip( sorted(self.q_func_vars, key=lambda v: v.name), sorted(self.target_q_func_vars, key=lambda v: v.name)): update_target_expr.append(var_target.assign(var)) self.update_target_expr = tf.group(*update_target_expr) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("rewards", self.rew_t), ("new_obs", self.obs_tp1), ("dones", self.done_mask), ("weights", self.importance_weights), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, loss=self.loss.loss, loss_inputs=self.loss_inputs) self.sess.run(tf.global_variables_initializer())
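# A NumPy sketch of the double-Q target selection built above: the online
# network picks the argmax action at s_{t+1}, the target network evaluates it
# (values are illustrative):
import numpy as np

q_tp1_online = np.array([[1.0, 2.0], [0.5, 0.1]])     # online net at s_{t+1}
q_tp1_target = np.array([[0.9, 1.8], [0.4, 0.2]])     # target net at s_{t+1}
best_actions = q_tp1_online.argmax(axis=1)             # -> [1, 0]
q_tp1_best = q_tp1_target[np.arange(len(best_actions)), best_actions]
# -> [1.8, 0.4]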
def get_action_dist(action_space, config, dist_type=None, framework="tf", **kwargs): """Returns a distribution class and size for the given action space. Args: action_space (Space): Action space of the target gym env. config (Optional[dict]): Optional model config. dist_type (Optional[str]): Identifier of the action distribution interpreted as a hint. framework (str): One of "tf", "tfe", or "torch". kwargs (dict): Optional kwargs to pass on to the Distribution's constructor. Returns: Tuple: - dist_class (ActionDistribution): Python class of the distribution. - dist_dim (int): The size of the input vector to the distribution. """ dist = None config = config or MODEL_DEFAULTS # Custom distribution given. if config.get("custom_action_dist"): action_dist_name = config["custom_action_dist"] logger.debug( "Using custom action distribution {}".format(action_dist_name)) dist = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name) # Dist_type is given directly as a class. elif type(dist_type) is type and \ issubclass(dist_type, ActionDistribution) and \ dist_type not in ( MultiActionDistribution, TorchMultiActionDistribution): dist = dist_type # Box space -> DiagGaussian OR Deterministic. elif isinstance(action_space, gym.spaces.Box): if len(action_space.shape) > 1: raise UnsupportedSpaceException( "Action space has multiple dimensions " "{}. ".format(action_space.shape) + "Consider reshaping this into a single dimension, " "using a custom action distribution, " "using a Tuple action space, or the multi-agent API.") # TODO(sven): Check for bounds and return SquashedNormal, etc.. if dist_type is None: dist = TorchDiagGaussian if framework == "torch" \ else DiagGaussian elif dist_type == "deterministic": dist = TorchDeterministic if framework == "torch" \ else Deterministic # Discrete Space -> Categorical. elif isinstance(action_space, gym.spaces.Discrete): dist = TorchCategorical if framework == "torch" else Categorical # Tuple/Dict Spaces -> MultiAction. elif dist_type in (MultiActionDistribution, TorchMultiActionDistribution) or \ isinstance(action_space, (gym.spaces.Tuple, gym.spaces.Dict)): flat_action_space = flatten_space(action_space) child_dists_and_in_lens = tree.map_structure( lambda s: ModelCatalog.get_action_dist( s, config, framework=framework), flat_action_space) child_dists = [e[0] for e in child_dists_and_in_lens] input_lens = [int(e[1]) for e in child_dists_and_in_lens] return partial( (TorchMultiActionDistribution if framework == "torch" else MultiActionDistribution), action_space=action_space, child_distributions=child_dists, input_lens=input_lens), int(sum(input_lens)) # Simplex -> Dirichlet. elif isinstance(action_space, Simplex): if framework == "torch": # TODO(sven): implement raise NotImplementedError( "Simplex action spaces not supported for torch.") dist = Dirichlet # MultiDiscrete -> MultiCategorical. elif isinstance(action_space, gym.spaces.MultiDiscrete): dist = TorchMultiCategorical if framework == "torch" else \ MultiCategorical return partial(dist, input_lens=action_space.nvec), \ int(sum(action_space.nvec)) # Unknown type -> Error. else: raise NotImplementedError("Unsupported args: {} {}".format( action_space, dist_type)) return dist, dist.required_model_output_shape(action_space, config)
def build_q_model_and_distribution(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    # NOTE: a manually specified embedding layer is used, so there is no need
    # to switch num_outputs based on config["hiddens"] as in the stock DQN
    # builder; the value below is not actually used by the model.
    num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.q_model = CurlRainbowTorchModel(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        name="rainbow_model",
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm,
        num_atoms=config["num_atoms"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        # customs
        embed_dim=config["embed_dim"],
        encoder_type=config["encoder_type"],
        num_layers=config["num_layers"],
        num_filters=config["num_filters"],
        cropped_image_size=config["cropped_image_size"])
    # Only the Q-network parameters (encoder params excluded).
    policy.q_func_vars = policy.q_model.q_variables()

    policy.target_q_model = CurlRainbowTorchModel(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        name="target_rainbow_model",
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm,
        num_atoms=config["num_atoms"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        # customs
        embed_dim=config["embed_dim"],
        encoder_type=config["encoder_type"],
        num_layers=config["num_layers"],
        num_filters=config["num_filters"],
        cropped_image_size=config["cropped_image_size"])
    policy.target_q_func_vars = policy.target_q_model.q_variables()

    # NOTE: customs - contrastive module sharing the online and target encoders.
    policy.curl = CURL(
        policy.q_model.embed_dim,
        policy.q_model.encoder,
        policy.target_q_model.encoder)

    return policy.q_model, TorchCategorical
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config) self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) self.prev_actions = ModelCatalog.get_action_placeholder(action_space) self.prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": self.observations, "prev_actions": self.prev_actions, "prev_rewards": self.prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) self.vf = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Box): ac_size = action_space.shape[0] actions = tf.placeholder(tf.float32, [None, ac_size], name="ac") elif isinstance(action_space, gym.spaces.Discrete): actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for A3C.".format( action_space)) advantages = tf.placeholder(tf.float32, [None], name="advantages") self.v_target = tf.placeholder(tf.float32, [None], name="v_target") self.loss = A3CLoss(action_dist, actions, advantages, self.v_target, self.vf, self.config["vf_loss_coeff"], self.config["entropy_coeff"]) # Initialize TFPolicyGraph loss_in = [ ("obs", self.observations), ("actions", actions), ("prev_actions", self.prev_actions), ("prev_rewards", self.prev_rewards), ("advantages", advantages), ("value_targets", self.v_target), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__(self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), action_prob=action_dist.sampled_action_prob(), loss=self.loss.total_loss, model=self.model, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=self.prev_actions, prev_reward_input=self.prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "policy_entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance(self.v_target, self.vf), }, } self.sess.run(tf.global_variables_initializer())
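# A hedged NumPy sketch of the `explained_variance` diagnostic reported above:
# 1 - Var[targets - predictions] / Var[targets]. RLlib's TF implementation may
# additionally clip the result or guard against zero variance:
import numpy as np


def explained_variance(y_true, y_pred):
    var_y = np.var(y_true)
    if var_y == 0:
        return float("nan")
    return 1.0 - np.var(y_true - y_pred) / var_y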
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) if not isinstance(action_space, Discrete): raise UnsupportedSpaceException( "Action space {} is not supported for DQN.".format( action_space)) self.config = config self.cur_epsilon = 1.0 self.num_actions = action_space.n # Action inputs self.stochastic = tf.placeholder(tf.bool, (), name="stochastic") self.eps = tf.placeholder(tf.float32, (), name="eps") self.cur_observations = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) # Action Q network with tf.variable_scope(Q_SCOPE) as scope: q_values, q_logits, q_dist, _ = self._build_q_network( self.cur_observations, observation_space, action_space) self.q_values = q_values self.q_func_vars = _scope_vars(scope.name) # Noise vars for Q network except for layer normalization vars if self.config["parameter_noise"]: self._build_parameter_noise([ var for var in self.q_func_vars if "LayerNorm" not in var.name ]) self.action_probs = tf.nn.softmax(self.q_values) # Action outputs self.output_actions, self.action_prob = self._build_q_value_policy( q_values) # Replay inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.act_t = tf.placeholder(tf.int32, [None], name="action") self.rew_t = tf.placeholder(tf.float32, [None], name="reward") self.obs_tp1 = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) self.done_mask = tf.placeholder(tf.float32, [None], name="done") self.importance_weights = tf.placeholder( tf.float32, [None], name="weight") # q network evaluation with tf.variable_scope(Q_SCOPE, reuse=True): prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) q_t, q_logits_t, q_dist_t, model = self._build_q_network( self.obs_t, observation_space, action_space) q_batchnorm_update_ops = list( set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops) # target q network evalution with tf.variable_scope(Q_TARGET_SCOPE) as scope: q_tp1, q_logits_tp1, q_dist_tp1, _ = self._build_q_network( self.obs_tp1, observation_space, action_space) self.target_q_func_vars = _scope_vars(scope.name) # q scores for actions which we know were selected in the given state. 
one_hot_selection = tf.one_hot(self.act_t, self.num_actions) q_t_selected = tf.reduce_sum(q_t * one_hot_selection, 1) q_logits_t_selected = tf.reduce_sum( q_logits_t * tf.expand_dims(one_hot_selection, -1), 1) # compute estimate of best possible value starting from state at t + 1 if config["double_q"]: with tf.variable_scope(Q_SCOPE, reuse=True): q_tp1_using_online_net, q_logits_tp1_using_online_net, \ q_dist_tp1_using_online_net, _ = self._build_q_network( self.obs_tp1, observation_space, action_space) q_tp1_best_using_online_net = tf.argmax(q_tp1_using_online_net, 1) q_tp1_best_one_hot_selection = tf.one_hot( q_tp1_best_using_online_net, self.num_actions) q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1) q_dist_tp1_best = tf.reduce_sum( q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1) else: q_tp1_best_one_hot_selection = tf.one_hot( tf.argmax(q_tp1, 1), self.num_actions) q_tp1_best = tf.reduce_sum(q_tp1 * q_tp1_best_one_hot_selection, 1) q_dist_tp1_best = tf.reduce_sum( q_dist_tp1 * tf.expand_dims(q_tp1_best_one_hot_selection, -1), 1) self.loss = self._build_q_loss(q_t_selected, q_logits_t_selected, q_tp1_best, q_dist_tp1_best) # update_target_fn will be called periodically to copy Q network to # target Q network update_target_expr = [] assert len(self.q_func_vars) == len(self.target_q_func_vars), \ (self.q_func_vars, self.target_q_func_vars) for var, var_target in zip(self.q_func_vars, self.target_q_func_vars): update_target_expr.append(var_target.assign(var)) self.update_target_expr = tf.group(*update_target_expr) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (SampleBatch.REWARDS, self.rew_t), (SampleBatch.NEXT_OBS, self.obs_tp1), (SampleBatch.DONES, self.done_mask), (PRIO_WEIGHTS, self.importance_weights), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.cur_observations, action_sampler=self.output_actions, action_prob=self.action_prob, loss=self.loss.loss, model=model, loss_inputs=self.loss_inputs, update_ops=q_batchnorm_update_ops) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = dict({ "cur_lr": tf.cast(self.cur_lr, tf.float64), }, **self.loss.stats)
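# A NumPy sketch of the one-hot selection used above to gather Q(s, a) (and,
# with an extra expand_dims, the per-atom logits) for the actions actually
# taken; values are illustrative:
import numpy as np

q_t = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
act_t = np.array([2, 0])
one_hot = np.eye(q_t.shape[1])[act_t]
q_t_selected = (q_t * one_hot).sum(axis=1)   # -> [3.0, 4.0]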
def __init__(self, observation_space, action_space, config, existing_inputs=None): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() # Create input placeholders if existing_inputs: actions, dones, behaviour_logits, rewards, observations, \ prev_actions, prev_rewards = existing_inputs[:7] existing_state_in = existing_inputs[7:-1] existing_seq_lens = existing_inputs[-1] else: if isinstance(action_space, gym.spaces.Discrete): ac_size = action_space.n actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder(tf.float32, [None, ac_size], name="behaviour_logits") observations = tf.placeholder(tf.float32, [None] + list(observation_space.shape)) existing_state_in = None existing_seq_lens = None # Setup the policy dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) prev_actions = ModelCatalog.get_action_placeholder(action_space) prev_rewards = tf.placeholder(tf.float32, [None], name="prev_reward") self.model = ModelCatalog.get_model( { "obs": observations, "prev_actions": prev_actions, "prev_rewards": prev_rewards, "is_training": self._get_is_training_placeholder(), }, observation_space, logit_dim, self.config["model"], state_in=existing_state_in, seq_lens=existing_seq_lens) action_dist = dist_class(self.model.outputs) values = self.model.value_function() self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) def to_batches(tensor): if self.config["model"]["use_lstm"]: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = self.config["sample_batch_size"] B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) if self.model.state_in: max_seq_len = tf.reduce_max(self.model.seq_lens) - 1 mask = tf.sequence_mask(self.model.seq_lens, max_seq_len) mask = tf.reshape(mask, [-1]) else: mask = tf.ones_like(rewards) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
self.loss = VTraceLoss( actions=to_batches(actions)[:-1], actions_logp=to_batches(action_dist.logp(actions))[:-1], actions_entropy=to_batches(action_dist.entropy())[:-1], dones=to_batches(dones)[:-1], behaviour_logits=to_batches(behaviour_logits)[:-1], target_logits=to_batches(self.model.outputs)[:-1], discount=config["gamma"], rewards=to_batches(rewards)[:-1], values=to_batches(values)[:-1], bootstrap_value=to_batches(values)[-1], valid_mask=to_batches(mask)[:-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # KL divergence between worker and learner logits for debugging model_dist = Categorical(self.model.outputs) behaviour_dist = Categorical(behaviour_logits) self.KLs = model_dist.kl(behaviour_dist) self.mean_KL = tf.reduce_mean(self.KLs) self.max_KL = tf.reduce_max(self.KLs) self.median_KL = tf.contrib.distributions.percentile(self.KLs, 50.0) # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", observations), ("prev_actions", prev_actions), ("prev_rewards", prev_rewards), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=observations, action_sampler=action_dist.sample(), loss=self.model.loss() + self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions, prev_reward_input=prev_rewards, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"], batch_divisibility_req=self.config["sample_batch_size"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(to_batches(values)[:-1], [-1])), "mean_KL": self.mean_KL, "max_KL": self.max_KL, "median_KL": self.median_KL, }, }
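# A NumPy sketch of the drop-last / bootstrap split applied to the time-major
# values above: the first T-1 steps feed the V-trace loss, the final step
# provides the bootstrap value (values are illustrative):
import numpy as np

values_time_major = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])  # [T, B]
loss_values = values_time_major[:-1]      # steps 0 .. T-2
bootstrap_value = values_time_major[-1]   # per-rollout value at the last step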
def build_sac_model(policy, obs_space, action_space, config):
    if config["model"].get("custom_model"):
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True

    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    # 2 cases:
    # 1) with separate state-preprocessor (before obs+action concat).
    # 2) no separate state-preprocessor: concat obs+actions right away.
    if config["use_state_preprocessor"]:
        num_outputs = 256  # Flatten last Conv2D to this many nodes.
    else:
        # Force-ignore any additionally provided hidden layer sizes.
        # Everything should be configured using SAC's "Q_model" and
        # "policy_model" settings.
        config["model"]["fcnet_hiddens"] = []
        num_outputs = 0

    policy.model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=SACTorchModel
        if config["use_pytorch"] else SACTFModel,
        name="sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch" if config["use_pytorch"] else "tf",
        model_interface=SACTorchModel
        if config["use_pytorch"] else SACTFModel,
        name="target_sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"])

    return policy.model
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.impala.impala.DEFAULT_CONFIG, **config) assert config["batch_mode"] == "truncate_episodes", \ "Must use `truncate_episodes` batch mode with V-trace." self.config = config self.sess = tf.get_default_session() # Setup the policy self.observations = tf.placeholder( tf.float32, [None] + list(observation_space.shape)) dist_class, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) self.model = ModelCatalog.get_model(self.observations, logit_dim, self.config["model"]) action_dist = dist_class(self.model.outputs) values = tf.reshape( linear(self.model.last_layer, 1, "value", normc_initializer(1.0)), [-1]) self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, tf.get_variable_scope().name) # Setup the policy loss if isinstance(action_space, gym.spaces.Discrete): ac_size = action_space.n actions = tf.placeholder(tf.int64, [None], name="ac") else: raise UnsupportedSpaceException( "Action space {} is not supported for IMPALA.".format( action_space)) dones = tf.placeholder(tf.bool, [None], name="dones") rewards = tf.placeholder(tf.float32, [None], name="rewards") behaviour_logits = tf.placeholder( tf.float32, [None, ac_size], name="behaviour_logits") def to_batches(tensor): if self.config["model"]["use_lstm"]: B = tf.shape(self.model.seq_lens)[0] T = tf.shape(tensor)[0] // B else: # Important: chop the tensor into batches at known episode cut # boundaries. TODO(ekl) this is kind of a hack T = (self.config["sample_batch_size"] // self.config["num_envs_per_worker"]) B = tf.shape(tensor)[0] // T rs = tf.reshape(tensor, tf.concat([[B, T], tf.shape(tensor)[1:]], axis=0)) # swap B and T axes return tf.transpose( rs, [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0])))) # Inputs are reshaped from [B * T] => [T - 1, B] for V-trace calc. 
self.loss = VTraceLoss( actions=to_batches(actions)[:-1], actions_logp=to_batches(action_dist.logp(actions))[:-1], actions_entropy=to_batches(action_dist.entropy())[:-1], dones=to_batches(dones)[:-1], behaviour_logits=to_batches(behaviour_logits)[:-1], target_logits=to_batches(self.model.outputs)[:-1], discount=config["gamma"], rewards=to_batches(rewards)[:-1], values=to_batches(values)[:-1], bootstrap_value=to_batches(values)[-1], vf_loss_coeff=self.config["vf_loss_coeff"], entropy_coeff=self.config["entropy_coeff"], clip_rho_threshold=self.config["vtrace_clip_rho_threshold"], clip_pg_rho_threshold=self.config["vtrace_clip_pg_rho_threshold"]) # Initialize TFPolicyGraph loss_in = [ ("actions", actions), ("dones", dones), ("behaviour_logits", behaviour_logits), ("rewards", rewards), ("obs", self.observations), ] LearningRateSchedule.__init__(self, self.config["lr"], self.config["lr_schedule"]) TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.observations, action_sampler=action_dist.sample(), loss=self.loss.total_loss, loss_inputs=loss_in, state_inputs=self.model.state_in, state_outputs=self.model.state_out, seq_lens=self.model.seq_lens, max_seq_len=self.config["model"]["max_seq_len"]) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "stats": { "cur_lr": tf.cast(self.cur_lr, tf.float64), "policy_loss": self.loss.pi_loss, "entropy": self.loss.entropy, "grad_gnorm": tf.global_norm(self._grads), "var_gnorm": tf.global_norm(self.var_list), "vf_loss": self.loss.vf_loss, "vf_explained_var": explained_variance( tf.reshape(self.loss.vtrace_returns.vs, [-1]), tf.reshape(to_batches(values)[:-1], [-1])), }, }
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.ddpg.ddpg.DEFAULT_CONFIG, **config)
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    self.dim_actions = action_space.shape[0]
    self.low_action = action_space.low
    self.high_action = action_space.high

    # create global step for counting the number of update operations
    self.global_step = tf.train.get_or_create_global_step()

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="cur_obs")

    # Actor: P (policy) network
    with tf.variable_scope(POLICY_SCOPE) as scope:
        p_values, self.p_model = self._build_p_network(
            self.cur_observations, observation_space, action_space)
        self.p_func_vars = _scope_vars(scope.name)

    # Noise vars for P network except for layer normalization vars
    if self.config["parameter_noise"]:
        self._build_parameter_noise([
            var for var in self.p_func_vars if "LayerNorm" not in var.name
        ])

    # Action outputs
    with tf.variable_scope(ACTION_SCOPE):
        self.output_actions = self._build_action_network(
            p_values, self.stochastic, self.eps)

    if self.config["smooth_target_policy"]:
        self.reset_noise_op = tf.no_op()
    else:
        with tf.variable_scope(ACTION_SCOPE, reuse=True):
            exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
            self.reset_noise_op = tf.assign(exploration_sample,
                                            self.dim_actions * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # p network evaluation
    with tf.variable_scope(POLICY_SCOPE, reuse=True) as scope:
        prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
        self.p_t, _ = self._build_p_network(self.obs_t, observation_space,
                                            action_space)
        p_batchnorm_update_ops = list(
            set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) -
            prev_update_ops)

    # target p network evaluation
    with tf.variable_scope(POLICY_TARGET_SCOPE) as scope:
        p_tp1, _ = self._build_p_network(self.obs_tp1, observation_space,
                                         action_space)
        target_p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(ACTION_SCOPE, reuse=True):
        output_actions = self._build_action_network(
            self.p_t,
            stochastic=tf.constant(value=False, dtype=tf.bool),
            eps=.0)
        output_actions_estimated = self._build_action_network(
            p_tp1,
            stochastic=tf.constant(
                value=self.config["smooth_target_policy"], dtype=tf.bool),
            eps=.0,
            is_target=True)

    # q network evaluation
    prev_update_ops = set(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
    with tf.variable_scope(Q_SCOPE) as scope:
        q_t, self.q_model = self._build_q_network(
            self.obs_t, observation_space, action_space, self.act_t)
        self.q_func_vars = _scope_vars(scope.name)
    self.stats = {
        "mean_q": tf.reduce_mean(q_t),
        "max_q": tf.reduce_max(q_t),
        "min_q": tf.reduce_min(q_t),
    }
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_tp0, _ = self._build_q_network(
            self.obs_t, observation_space, action_space, output_actions)
    if self.config["twin_q"]:
        with tf.variable_scope(TWIN_Q_SCOPE) as scope:
            twin_q_t, self.twin_q_model = self._build_q_network(
                self.obs_t, observation_space, action_space, self.act_t)
            self.twin_q_func_vars = _scope_vars(scope.name)
    q_batchnorm_update_ops = list(
        set(tf.get_collection(tf.GraphKeys.UPDATE_OPS)) - prev_update_ops)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1, _ = self._build_q_network(
            self.obs_tp1, observation_space, action_space,
            output_actions_estimated)
        target_q_func_vars = _scope_vars(scope.name)
    if self.config["twin_q"]:
        with tf.variable_scope(TWIN_Q_TARGET_SCOPE) as scope:
            twin_q_tp1, _ = self._build_q_network(
                self.obs_tp1, observation_space, action_space,
                output_actions_estimated)
            twin_target_q_func_vars = _scope_vars(scope.name)

    if self.config["twin_q"]:
        self.loss = self._build_actor_critic_loss(
            q_t, q_tp1, q_tp0, twin_q_t=twin_q_t, twin_q_tp1=twin_q_tp1)
    else:
        self.loss = self._build_actor_critic_loss(q_t, q_tp1, q_tp0)

    if config["l2_reg"] is not None:
        for var in self.p_func_vars:
            if "bias" not in var.name:
                self.loss.actor_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        for var in self.q_func_vars:
            if "bias" not in var.name:
                self.loss.critic_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        if self.config["twin_q"]:
            for var in self.twin_q_func_vars:
                if "bias" not in var.name:
                    self.loss.critic_loss += (
                        config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    if self.config["twin_q"]:
        for var, var_target in zip(
                sorted(self.twin_q_func_vars, key=lambda v: v.name),
                sorted(twin_target_q_func_vars, key=lambda v: v.name)):
            update_target_expr.append(
                var_target.assign(self.tau * var +
                                  (1.0 - self.tau) * var_target))
    for var, var_target in zip(
            sorted(self.p_func_vars, key=lambda v: v.name),
            sorted(target_p_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*update_target_expr)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (SampleBatch.REWARDS, self.rew_t),
        (SampleBatch.NEXT_OBS, self.obs_tp1),
        (SampleBatch.DONES, self.done_mask),
        (PRIO_WEIGHTS, self.importance_weights),
    ]
    input_dict = dict(self.loss_inputs)

    # Model self-supervised losses
    self.loss.actor_loss = self.p_model.custom_loss(
        self.loss.actor_loss, input_dict)
    self.loss.critic_loss = self.q_model.custom_loss(
        self.loss.critic_loss, input_dict)
    if self.config["twin_q"]:
        self.loss.critic_loss = self.twin_q_model.custom_loss(
            self.loss.critic_loss, input_dict)

    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions,
        loss=self.loss.actor_loss + self.loss.critic_loss,
        loss_inputs=self.loss_inputs,
        update_ops=q_batchnorm_update_ops + p_batchnorm_update_ops)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        tf.group(q_tp0, q_tp1), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)
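The target-network update ops built above all apply the same Polyak averaging rule, so update_target(tau=1.0) is a hard copy. A standalone sketch with NumPy arrays standing in for the TF variables (illustrative only):

import numpy as np

def soft_update(target, source, tau):
    # theta_target <- tau * theta + (1 - tau) * theta_target
    return tau * source + (1.0 - tau) * target

target_w = np.zeros(3)
source_w = np.array([1.0, 2.0, 3.0])
print(soft_update(target_w, source_w, tau=1.0))    # hard copy: [1. 2. 3.]
print(soft_update(target_w, source_w, tau=0.001))  # slow tracking: [0.001 0.002 0.003]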
def get_action_dist(
    action_space: gym.Space,
    config: ModelConfigDict,
    dist_type: Optional[Union[str, Type[ActionDistribution]]] = None,
    framework: str = "tf",
    **kwargs
) -> (type, int):
    """Returns a distribution class and size for the given action space.

    Args:
        action_space: Action space of the target gym env.
        config: Optional model config dict.
        dist_type: Identifier of the action distribution (str) interpreted
            as a hint, or the actual ActionDistribution class to use.
        framework: One of "tf2", "tf", "tfe", "torch", or "jax".
        kwargs: Optional kwargs to pass on to the Distribution's constructor.

    Returns:
        Tuple consisting of:
        - dist_class: Python class of the distribution.
        - dist_dim: The size of the input vector to the distribution.
    """
    dist_cls = None
    config = config or MODEL_DEFAULTS

    # Custom distribution given.
    if config.get("custom_action_dist"):
        custom_action_config = config.copy()
        action_dist_name = custom_action_config.pop("custom_action_dist")
        logger.debug(
            "Using custom action distribution {}".format(action_dist_name))
        dist_cls = _global_registry.get(RLLIB_ACTION_DIST, action_dist_name)
        return ModelCatalog._get_multi_action_distribution(
            dist_cls, action_space, custom_action_config, framework
        )
    # Dist_type is given directly as a class.
    elif (
        type(dist_type) is type
        and issubclass(dist_type, ActionDistribution)
        and dist_type not in (MultiActionDistribution,
                              TorchMultiActionDistribution)
    ):
        dist_cls = dist_type
    # Box space -> DiagGaussian OR Deterministic.
    elif isinstance(action_space, Box):
        if action_space.dtype.name.startswith("int"):
            low_ = np.min(action_space.low)
            high_ = np.max(action_space.high)
            dist_cls = (
                TorchMultiCategorical if framework == "torch"
                else MultiCategorical
            )
            num_cats = int(np.product(action_space.shape))
            return (
                partial(
                    dist_cls,
                    input_lens=[high_ - low_ + 1 for _ in range(num_cats)],
                    action_space=action_space,
                ),
                num_cats * (high_ - low_ + 1),
            )
        else:
            if len(action_space.shape) > 1:
                raise UnsupportedSpaceException(
                    "Action space has multiple dimensions "
                    "{}. ".format(action_space.shape) +
                    "Consider reshaping this into a single dimension, "
                    "using a custom action distribution, "
                    "using a Tuple action space, or the multi-agent API."
                )
            # TODO(sven): Check for bounds and return SquashedNormal, etc..
            if dist_type is None:
                return (
                    partial(
                        TorchDiagGaussian if framework == "torch"
                        else DiagGaussian,
                        action_space=action_space,
                    ),
                    DiagGaussian.required_model_output_shape(
                        action_space, config),
                )
            elif dist_type == "deterministic":
                dist_cls = (
                    TorchDeterministic if framework == "torch"
                    else Deterministic
                )
    # Discrete Space -> Categorical.
    elif isinstance(action_space, Discrete):
        if framework == "torch":
            dist_cls = TorchCategorical
        elif framework == "jax":
            from ray.rllib.models.jax.jax_action_dist import JAXCategorical

            dist_cls = JAXCategorical
        else:
            dist_cls = Categorical
    # Tuple/Dict Spaces -> MultiAction.
    elif (
        dist_type in (MultiActionDistribution, TorchMultiActionDistribution)
        or isinstance(action_space, (Tuple, Dict))
    ):
        return ModelCatalog._get_multi_action_distribution(
            (
                MultiActionDistribution if framework == "tf"
                else TorchMultiActionDistribution
            ),
            action_space,
            config,
            framework,
        )
    # Simplex -> Dirichlet.
    elif isinstance(action_space, Simplex):
        if framework == "torch":
            # TODO(sven): implement
            raise NotImplementedError(
                "Simplex action spaces not supported for torch."
            )
        dist_cls = Dirichlet
    # MultiDiscrete -> MultiCategorical.
    elif isinstance(action_space, MultiDiscrete):
        dist_cls = (
            TorchMultiCategorical if framework == "torch"
            else MultiCategorical
        )
        return partial(dist_cls, input_lens=action_space.nvec), int(
            sum(action_space.nvec)
        )
    # Unknown type -> Error.
    else:
        raise NotImplementedError(
            "Unsupported args: {} {}".format(action_space, dist_type)
        )

    return dist_cls, dist_cls.required_model_output_shape(
        action_space, config)
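A hedged usage sketch for the function above. It assumes ray[rllib] and gym are installed and that this is exposed as the ModelCatalog.get_action_dist static method, as in the RLlib versions this code comes from; the exact returned classes can differ between releases.

import gym
from ray.rllib.models import MODEL_DEFAULTS, ModelCatalog

dist_cls, dist_dim = ModelCatalog.get_action_dist(
    gym.spaces.Discrete(4), MODEL_DEFAULTS, framework="torch")
# For Discrete(4) this should yield a categorical distribution class and a
# required model output size of 4 (one logit per action).
print(dist_cls, dist_dim)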
def build_q_model(policy: Policy, obs_space: gym.spaces.Space,
                  action_space: gym.spaces.Space,
                  config: TrainerConfigDict) -> ModelV2:
    """Build q_model and target_q_model for DQN.

    Args:
        policy (Policy): The Policy, which will use the model for
            optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The Trainer's config dict.

    Returns:
        ModelV2: The Model for the Policy to use.
            Note: The target q model will not be returned, just assigned to
            `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # try to infer the last layer size, otherwise fall back to 256
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        # generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        # generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    return policy.q_model
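A small sketch of the "last hidden layer size or 256" fallback used above when config["hiddens"] is set: prepending 256 to fcnet_hiddens and taking the last element yields the final configured hidden width when one exists, and 256 when the list is empty. The values are illustrative.

for fcnet_hiddens in ([256, 128], []):
    num_outputs = ([256] + fcnet_hiddens)[-1]
    print(fcnet_hiddens, "->", num_outputs)  # [256, 128] -> 128, [] -> 256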
def __init__(self, observation_space, action_space, registry, config):
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                action_space))

    self.config = config
    self.cur_epsilon = 1.0
    dim_actions = action_space.shape[0]
    low_action = action_space.low
    high_action = action_space.high
    self.actor_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["actor_lr"])
    self.critic_optimizer = tf.train.AdamOptimizer(
        learning_rate=config["critic_lr"])

    # Action inputs
    self.stochastic = tf.placeholder(tf.bool, (), name="stochastic")
    self.eps = tf.placeholder(tf.float32, (), name="eps")
    self.cur_observations = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)

    # Actor: P (policy) network
    with tf.variable_scope(P_SCOPE) as scope:
        p_values = _build_p_network(registry, self.cur_observations,
                                    dim_actions, config)
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE):
        self.output_actions = _build_action_network(
            p_values, low_action, high_action, self.stochastic, self.eps,
            config["exploration_theta"], config["exploration_sigma"])

    with tf.variable_scope(A_SCOPE, reuse=True):
        exploration_sample = tf.get_variable(name="ornstein_uhlenbeck")
        self.reset_noise_op = tf.assign(exploration_sample,
                                        dim_actions * [.0])

    # Replay inputs
    self.obs_t = tf.placeholder(
        tf.float32,
        shape=(None, ) + observation_space.shape,
        name="observation")
    self.act_t = tf.placeholder(
        tf.float32, shape=(None, ) + action_space.shape, name="action")
    self.rew_t = tf.placeholder(tf.float32, [None], name="reward")
    self.obs_tp1 = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    self.done_mask = tf.placeholder(tf.float32, [None], name="done")
    self.importance_weights = tf.placeholder(
        tf.float32, [None], name="weight")

    # p network evaluation
    with tf.variable_scope(P_SCOPE, reuse=True) as scope:
        self.p_t = _build_p_network(registry, self.obs_t, dim_actions,
                                    config)

    # target p network evaluation
    with tf.variable_scope(P_TARGET_SCOPE) as scope:
        p_tp1 = _build_p_network(registry, self.obs_tp1, dim_actions,
                                 config)
        target_p_func_vars = _scope_vars(scope.name)

    # Action outputs
    with tf.variable_scope(A_SCOPE, reuse=True):
        deterministic_flag = tf.constant(value=False, dtype=tf.bool)
        zero_eps = tf.constant(value=.0, dtype=tf.float32)
        output_actions = _build_action_network(
            self.p_t, low_action, high_action, deterministic_flag,
            zero_eps, config["exploration_theta"],
            config["exploration_sigma"])
        output_actions_estimated = _build_action_network(
            p_tp1, low_action, high_action, deterministic_flag, zero_eps,
            config["exploration_theta"], config["exploration_sigma"])

    # q network evaluation
    with tf.variable_scope(Q_SCOPE) as scope:
        q_t = _build_q_network(registry, self.obs_t, self.act_t, config)
        self.q_func_vars = _scope_vars(scope.name)
    with tf.variable_scope(Q_SCOPE, reuse=True):
        q_tp0 = _build_q_network(registry, self.obs_t, output_actions,
                                 config)

    # target q network evaluation
    with tf.variable_scope(Q_TARGET_SCOPE) as scope:
        q_tp1 = _build_q_network(registry, self.obs_tp1,
                                 output_actions_estimated, config)
        target_q_func_vars = _scope_vars(scope.name)

    q_t_selected = tf.squeeze(q_t, axis=len(q_t.shape) - 1)
    q_tp1_best = tf.squeeze(input=q_tp1, axis=len(q_tp1.shape) - 1)
    q_tp1_best_masked = (1.0 - self.done_mask) * q_tp1_best

    # compute RHS of Bellman equation
    q_t_selected_target = (
        self.rew_t + config["gamma"]**config["n_step"] * q_tp1_best_masked)

    # compute the error (potentially clipped)
    self.td_error = q_t_selected - tf.stop_gradient(q_t_selected_target)
    if config.get("use_huber"):
        errors = _huber_loss(self.td_error, config.get("huber_threshold"))
    else:
        errors = 0.5 * tf.square(self.td_error)
    self.loss = tf.reduce_mean(self.importance_weights * errors)

    # for policy gradient
    self.actor_loss = -1.0 * tf.reduce_mean(q_tp0)

    if config["l2_reg"] is not None:
        for var in self.p_func_vars:
            if "bias" not in var.name:
                self.actor_loss += (
                    config["l2_reg"] * 0.5 * tf.nn.l2_loss(var))
        for var in self.q_func_vars:
            if "bias" not in var.name:
                self.loss += config["l2_reg"] * 0.5 * tf.nn.l2_loss(var)

    # update_target_fn will be called periodically to copy Q network to
    # target Q network
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    update_target_expr = []
    for var, var_target in zip(
            sorted(self.q_func_vars, key=lambda v: v.name),
            sorted(target_q_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    for var, var_target in zip(
            sorted(self.p_func_vars, key=lambda v: v.name),
            sorted(target_p_func_vars, key=lambda v: v.name)):
        update_target_expr.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*update_target_expr)

    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("rewards", self.rew_t),
        ("new_obs", self.obs_tp1),
        ("dones", self.done_mask),
        ("weights", self.importance_weights),
    ]
    self.is_training = tf.placeholder_with_default(True, ())
    TFPolicyGraph.__init__(
        self,
        self.sess,
        obs_input=self.cur_observations,
        action_sampler=self.output_actions,
        loss=self.loss,
        loss_inputs=self.loss_inputs,
        is_training=self.is_training)
    self.sess.run(tf.global_variables_initializer())

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(q_tp0, q_tp1), self.sess)

    # Hard initial update
    self.update_target(tau=1.0)
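A NumPy sketch of the n-step Bellman target computed above, y_t = r_t + gamma**n_step * (1 - done_t) * Q_target(s_{t+n}, mu(s_{t+n})); the numbers are illustrative, not taken from any run.

import numpy as np

gamma, n_step = 0.99, 1
rew_t = np.array([1.0, 0.0])
done_mask = np.array([0.0, 1.0])     # 1.0 where the episode terminated
q_tp1_best = np.array([10.0, 10.0])  # target critic value at the next obs
q_t_selected_target = rew_t + gamma**n_step * (1.0 - done_mask) * q_tp1_best
print(q_t_selected_target)           # [10.9  0. ]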