def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None, ) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None, ))

    action_space = self.env.action_space
    self.actions = ModelCatalog.get_action_placeholder(action_space)
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"])
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

    self.inputs = [
        ("obs", self.observations),
        ("value_targets", self.value_targets),
        ("advantages", self.advantages),
        ("actions", self.actions),
        ("logprobs", self.prev_logits),
        ("vf_preds", self.prev_vf_preds),
    ]
    self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

    # References to the model weights
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(
        config["observation_filter"], self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    self.sampler = SyncSampler(
        self.env, self.common_policy, self.obs_filter,
        self.config["horizon"], self.config["horizon"])
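# --- Illustrative sketch, not part of the original source. ---
# The (name, placeholder) pairs collected in `self.inputs` above make it
# straightforward to build a feed_dict from a sampled batch. The helper
# below is hypothetical: `batch` is assumed to be a dict mapping the same
# string keys ("obs", "advantages", ...) to numpy arrays.
def _example_feed_dict(inputs, batch, kl_coeff_ph, kl_coeff_val):
    # Zip each loss-input placeholder with the matching batch column.
    feed = {ph: batch[name] for name, ph in inputs}
    # The KL penalty coefficient is fed separately: it is a scalar the
    # trainer adapts between epochs, not per-sample batch data.
    feed[kl_coeff_ph] = kl_coeff_val
    return feed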
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    self.config = config
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Action inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(
        tf.float32, [None], name="prev_reward")

    with tf.variable_scope(POLICY_SCOPE) as scope:
        self.model = ModelCatalog.get_model({
            "obs": self.obs_t,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        }, observation_space, action_space, logit_dim,
            self.config["model"])
        logits = self.model.outputs
        self.p_func_vars = scope_vars(scope.name)

    # Action outputs
    action_dist = dist_cls(logits)
    self.output_actions = action_dist.sample()

    # Training inputs
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

    # v network evaluation
    with tf.variable_scope(VALUE_SCOPE) as scope:
        state_values = self.model.value_function()
        self.v_func_vars = scope_vars(scope.name)
    self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
    self.p_loss = self._build_policy_loss(
        state_values, self.cum_rew_t, logits, self.act_t, action_space)

    # which kind of objective to optimize
    objective = (
        self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
    self.explained_variance = tf.reduce_mean(
        explained_variance(self.cum_rew_t, state_values))

    # initialize TFPolicy
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        (SampleBatch.CUR_OBS, self.obs_t),
        (SampleBatch.ACTIONS, self.act_t),
        (Postprocessing.ADVANTAGES, self.cum_rew_t),
    ]
    TFPolicy.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.obs_t,
        action_sampler=self.output_actions,
        action_prob=action_dist.sampled_action_prob(),
        loss=objective,
        model=self.model,
        loss_inputs=self.loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph)
    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "total_loss": objective,
        "vf_explained_var": self.explained_variance,
        "policy_loss": self.p_loss.loss,
        "vf_loss": self.v_loss.loss
    }
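# --- Illustrative sketch, not part of the original source. ---
# `_build_value_loss` is defined elsewhere in this class; all the code
# above requires is that the returned object expose a `.loss` attribute
# that can be scaled by `vf_coeff`. A plausible minimal form, assuming a
# plain squared error between predicted state values and observed
# cumulative rewards (an assumption, not the library's actual code):
import tensorflow as tf

class _ExampleValueLoss(object):
    def __init__(self, state_values, cum_rewards):
        # 0.5 * mean((V(s) - R)^2), a common value-function objective.
        self.loss = 0.5 * tf.reduce_mean(
            tf.square(state_values - cum_rewards))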
def _initialize_loss(self):
    def fake_array(tensor, none_shape):
        shape = tensor.shape.as_list()
        non_none_shape = [s for s in shape if s is not None]
        none_shape = none_shape if isinstance(none_shape,
                                              list) else [none_shape]
        shape = none_shape + non_none_shape
        return np.zeros(shape, dtype=tensor.dtype.as_numpy_dtype)

    T = self.config["model"]["max_seq_len"]
    B = self.config["train_batch_size"] // T
    dummy_batch = {
        SampleBatch.CUR_OBS: fake_array(self._obs_input, B * T),
        SampleBatch.NEXT_OBS: fake_array(self._obs_input, B * T),
        SampleBatch.DONES: np.array([False] * B * T, dtype=np.bool),
        SampleBatch.ACTIONS: fake_array(
            ModelCatalog.get_action_placeholder(self.action_space), B * T),
        SampleBatch.REWARDS: np.array([0] * B * T, dtype=np.float32),
        SampleBatch.INFOS: np.array([self.sample_info] * B * T),
    }
    if self._obs_include_prev_action_reward:
        dummy_batch.update({
            SampleBatch.PREV_ACTIONS: fake_array(
                self._prev_action_input, B * T),
            SampleBatch.PREV_REWARDS: fake_array(
                self._prev_reward_input, B * T),
        })
    state_init = self.get_initial_state()
    state_batches = []
    for i, h in enumerate(state_init):
        dummy_batch["state_in_{}".format(i)] = np.repeat(
            np.expand_dims(h, 0), B * T, 0)
        dummy_batch["state_out_{}".format(i)] = np.repeat(
            np.expand_dims(h, 0), B * T, 0)
        state_batches.append(np.repeat(np.expand_dims(h, 0), B * T, 0))
    if state_init:
        dummy_batch["seq_lens"] = np.array([T] * B * T, dtype=np.int32)
    for k, v in self.extra_compute_action_fetches().items():
        dummy_batch[k] = fake_array(v, B * T)

    # postprocessing might depend on variable init, so run it first here
    self._sess.run(tf.global_variables_initializer())
    postprocessed_batch = self.postprocess_trajectory(
        SampleBatch(dummy_batch))

    # model forward pass for the loss (needed after postprocess to
    # overwrite any tensor state from that call)
    self.model(self._input_dict, self._state_in, self._seq_lens)

    if self._obs_include_prev_action_reward:
        train_batch = UsageTrackingDict({
            SampleBatch.PREV_ACTIONS: self._prev_action_input,
            SampleBatch.PREV_REWARDS: self._prev_reward_input,
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.PREV_ACTIONS, self._prev_action_input),
            (SampleBatch.PREV_REWARDS, self._prev_reward_input),
            (SampleBatch.CUR_OBS, self._obs_input),
        ]
    else:
        train_batch = UsageTrackingDict({
            SampleBatch.CUR_OBS: self._obs_input,
        })
        loss_inputs = [
            (SampleBatch.CUR_OBS, self._obs_input),
        ]

    for k, v in postprocessed_batch.items():
        if k in train_batch:
            continue
        elif v.dtype == np.object:
            continue  # can't handle arbitrary objects in TF
        elif k == "seq_lens" or k.startswith("state_in_"):
            continue
        shape = (None, ) + v.shape[1:]
        dtype = np.float32 if v.dtype == np.float64 else v.dtype
        placeholder = tf.placeholder(dtype, shape=shape, name=k)
        train_batch[k] = placeholder

    for i, si in enumerate(self._state_in):
        train_batch["state_in_{}".format(i)] = si
    train_batch["seq_lens"] = self._seq_lens

    if log_once("loss_init"):
        logger.debug(
            "Initializing loss function with dummy input:\n\n{}\n".format(
                summarize(train_batch)))

    self._loss_input_dict = train_batch
    loss = self._do_loss_init(train_batch)
    for k in sorted(train_batch.accessed_keys):
        if k != "seq_lens" and not k.startswith("state_in_"):
            loss_inputs.append((k, train_batch[k]))

    TFPolicy._initialize_loss(self, loss, loss_inputs)
    if self._grad_stats_fn:
        self._stats_fetches.update(
            self._grad_stats_fn(self, train_batch, self._grads))
    self._sess.run(tf.global_variables_initializer())
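# --- Illustrative sketch, not part of the original source. ---
# `fake_array` above swaps the leading None (batch) dimension of a
# placeholder's shape for a concrete size, so a zero-filled dummy batch
# can be fed through postprocess_trajectory() before any real data
# exists. A standalone demo of the same shape logic, assuming TF1-style
# placeholders as in the surrounding code:
import numpy as np
import tensorflow as tf

def _demo_fake_array(batch_size=32):
    obs_ph = tf.placeholder(tf.float32, shape=(None, 84, 84, 4))
    shape = [batch_size] + [
        s for s in obs_ph.shape.as_list() if s is not None]
    dummy = np.zeros(shape, dtype=obs_ph.dtype.as_numpy_dtype)
    assert dummy.shape == (batch_size, 84, 84, 4)  # batch dim now concrete
    return dummy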
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config)
    self.config = config
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    # Action inputs
    self.obs_t = tf.placeholder(
        tf.float32, shape=(None, ) + observation_space.shape)
    prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
    prev_rewards_ph = tf.placeholder(
        tf.float32, [None], name="prev_reward")

    with tf.variable_scope(P_SCOPE) as scope:
        self.model = ModelCatalog.get_model({
            "obs": self.obs_t,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        }, observation_space, action_space, logit_dim,
            self.config["model"])
        logits = self.model.outputs
        self.p_func_vars = _scope_vars(scope.name)

    # Action outputs
    action_dist = dist_cls(logits)
    self.output_actions = action_dist.sample()

    # Training inputs
    self.act_t = tf.placeholder(tf.int32, [None], name="action")
    self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward")

    # v network evaluation
    with tf.variable_scope(V_SCOPE) as scope:
        state_values = self.model.value_function()
        self.v_func_vars = _scope_vars(scope.name)
    self.v_loss = self._build_value_loss(state_values, self.cum_rew_t)
    self.p_loss = self._build_policy_loss(
        state_values, self.cum_rew_t, logits, self.act_t, action_space)

    # which kind of objective to optimize
    objective = (
        self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss)
    self.explained_variance = tf.reduce_mean(
        explained_variance(self.cum_rew_t, state_values))

    # initialize TFPolicyGraph
    self.sess = tf.get_default_session()
    self.loss_inputs = [
        ("obs", self.obs_t),
        ("actions", self.act_t),
        ("advantages", self.cum_rew_t),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.obs_t,
        action_sampler=self.output_actions,
        action_prob=action_dist.sampled_action_prob(),
        loss=objective,
        model=self.model,
        loss_inputs=self.loss_inputs,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph)
    self.sess.run(tf.global_variables_initializer())

    self.stats_fetches = {
        "total_loss": objective,
        "vf_explained_var": self.explained_variance,
        "policy_loss": self.p_loss.loss,
        "vf_loss": self.v_loss.loss
    }
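# --- Illustrative sketch, not part of the original source. ---
# `explained_variance(y, pred)` used above is the standard regression
# diagnostic 1 - Var[y - pred] / Var[y]: it approaches 1 for a perfect
# value function and drops to 0 or below when predictions are no better
# than the mean return. A minimal TF1-style version (an assumption, not
# the library's exact implementation):
import tensorflow as tf

def _example_explained_variance(y, pred):
    _, y_var = tf.nn.moments(y, axes=[0])
    _, diff_var = tf.nn.moments(y - pred, axes=[0])
    # Guard against division by zero when the targets are constant.
    return 1.0 - diff_var / tf.maximum(y_var, 1e-8)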
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.is_remote = is_remote
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    if config["tf_debug_inf_or_nan"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter(
            "has_inf_or_nan", tf_debug.has_inf_or_nan)

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None,))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    self.actions = ModelCatalog.get_action_placeholder(action_space)
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

    assert config["sgd_batchsize"] % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    if is_remote:
        self.batch_size = config["rollout_batchsize"]
        self.per_device_batch_size = config["rollout_batchsize"]
    else:
        self.batch_size = config["sgd_batchsize"]
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space,
            obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess, self.registry)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
        self.devices,
        [self.observations, self.value_targets, self.advantages,
         self.actions, self.prev_logits, self.prev_vf_preds],
        self.per_device_batch_size,
        build_loss,
        self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[policy.loss for policy in policies]), 0)
        self.mean_policy_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_policy_loss for policy in policies]), 0)
        self.mean_vf_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_vf_loss for policy in policies]), 0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_entropy for policy in policies]), 0)

    # References to the model weights
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(
        config["observation_filter"], self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    self.sampler = SyncSampler(
        self.env, self.common_policy, self.obs_filter,
        self.config["horizon"], self.config["horizon"])
    self.sess.run(tf.global_variables_initializer())
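# --- Illustrative sketch, not part of the original source. ---
# The assert above forces sgd_batchsize to split evenly across the SGD
# towers, so each device optimizes on an identical share of the batch.
# A hypothetical standalone version of the same check:
def _example_per_device_split(sgd_batchsize, devices):
    # Fail fast on uneven configurations rather than silently truncate.
    assert sgd_batchsize % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    return sgd_batchsize // len(devices)

# _example_per_device_split(128, ["/gpu:0", "/gpu:1"])  # -> 64 per tower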
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.is_remote = is_remote
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    if config["tf_debug_inf_or_nan"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter(
            "has_inf_or_nan", tf_debug.has_inf_or_nan)

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None,))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    self.actions = ModelCatalog.get_action_placeholder(action_space)
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

    if is_remote:
        self.batch_size = config["rollout_batchsize"]
        self.per_device_batch_size = config["rollout_batchsize"]
    else:
        self.batch_size = int(
            config["sgd_batchsize"] / len(devices)) * len(devices)
        assert self.batch_size % len(devices) == 0
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space,
            obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess, self.registry)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
        self.devices,
        [self.observations, self.value_targets, self.advantages,
         self.actions, self.prev_logits, self.prev_vf_preds],
        self.per_device_batch_size,
        build_loss,
        self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[policy.loss for policy in policies]), 0)
        self.mean_policy_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_policy_loss for policy in policies]), 0)
        self.mean_vf_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_vf_loss for policy in policies]), 0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_entropy for policy in policies]), 0)

    # References to the model weights
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(
        config["observation_filter"], self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    self.sampler = SyncSampler(
        self.env, self.common_policy, self.obs_filter,
        self.config["horizon"], self.config["horizon"])
    self.sess.run(tf.global_variables_initializer())
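# --- Illustrative sketch, not part of the original source. ---
# Unlike the assert-based variant earlier in this section, the version
# above rounds sgd_batchsize down to the nearest multiple of
# len(devices), so an uneven configuration degrades gracefully instead
# of failing at construction time. A hypothetical standalone version of
# that arithmetic:
def _example_rounded_batch(sgd_batchsize, devices):
    batch_size = (sgd_batchsize // len(devices)) * len(devices)
    return batch_size, batch_size // len(devices)

# _example_rounded_batch(100, ["/gpu:0", "/gpu:1", "/gpu:2"])
# -> (99, 33): one sample is dropped rather than raising an error.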