def testCustomActionDistribution(self):
    ray.init()

    # registration
    ModelCatalog.register_custom_action_dist("test", CustomActionDistribution)
    action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)

    # test retrieving it
    model_config = MODEL_DEFAULTS.copy()
    model_config["custom_action_dist"] = "test"
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(str(dist_cls), str(CustomActionDistribution))
    self.assertEqual(param_shape, action_space.shape)

    # test the class works as a distribution
    dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
    dist = dist_cls(dist_input, model_config=model_config)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()

    # test passing the options to it
    model_config["custom_options"].update({"output_dim": (3, )})
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(param_shape, (3, ))
    dist_input = tf.placeholder(tf.float32, (None, ) + param_shape)
    dist = dist_cls(dist_input, model_config=model_config)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()
def __init__(self,
             sess,
             action_space,
             obs_space,
             preprocessor,
             observation_filter,
             model_config,
             action_noise_std=0.0):
    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(observation_filter,
                                         self.preprocessor.shape)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space, model_config, dist_type="deterministic")
    model = ModelCatalog.get_model({
        "obs": self.inputs
    }, obs_space, action_space, dist_dim, model_config)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def __init__(self, state_values, cumulative_rewards, logits, actions,
             action_space, beta):
    ma_adv_norm = tf.get_variable(
        name="moving_average_of_advantage_norm",
        dtype=tf.float32,
        initializer=100.0,
        trainable=False)
    # advantage estimation
    adv = cumulative_rewards - state_values
    # update averaged advantage norm
    update_adv_norm = tf.assign_add(
        ref=ma_adv_norm,
        value=1e-6 * (tf.reduce_mean(tf.square(adv)) - ma_adv_norm))

    # exponentially weighted advantages
    with tf.control_dependencies([update_adv_norm]):
        exp_advs = tf.exp(
            beta * tf.divide(adv, 1e-8 + tf.sqrt(ma_adv_norm)))

    # log\pi_\theta(a|s)
    dist_cls, _ = ModelCatalog.get_action_dist(action_space, {})
    action_dist = dist_cls(logits)
    logprobs = action_dist.logp(actions)

    self.loss = -1.0 * tf.reduce_mean(
        tf.stop_gradient(exp_advs) * logprobs)
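# Restated from the snippet above as a reading aid (not additional source
# code): with advantage A = cumulative_rewards - state_values and a moving
# average of E[A^2], the objective is the advantage-weighted log-likelihood
#     loss = -E[ stop_gradient(exp(beta * A / (sqrt(E[A^2]) + 1e-8))) * log pi(a|s) ]
# i.e. a MARWIL-style loss in which the exponential weight is treated as a
# constant, so gradients flow only through log pi(a|s).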
def __init__(self, obs_space, action_space, config):
    self.action_space = action_space
    self.action_noise_std = config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(obs_space)
    self.observation_filter = get_filter(config["observation_filter"],
                                         self.preprocessor.shape)
    self.single_threaded = config.get("single_threaded", False)
    self.sess = make_session(single_threaded=self.single_threaded)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, config["model"], dist_type="deterministic")
    model = ModelCatalog.get_model({SampleBatch.CUR_OBS: self.inputs},
                                   obs_space, action_space, dist_dim,
                                   config["model"])
    dist = dist_class(model.outputs, model)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def __init__(self, sess, action_space, preprocessor, observation_filter,
             action_noise_std):
    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    if observation_filter == "MeanStdFilter":
        self.observation_filter = MeanStdFilter(
            self.preprocessor.shape, clip=None)
    elif observation_filter == "NoFilter":
        self.observation_filter = NoFilter()
    else:
        # Report the actual filter name, not the literal string.
        raise Exception("Unknown observation_filter: " +
                        str(observation_filter))

    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(self.inputs, dist_dim)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def test_custom_multi_action_distribution(self):
    class Model:
        pass

    ray.init(
        object_store_memory=1000 * 1024 * 1024,
        ignore_reinit_error=True)  # otherwise fails sometimes locally
    # registration
    ModelCatalog.register_custom_action_dist("test",
                                             CustomMultiActionDistribution)
    s1 = Discrete(5)
    s2 = Box(0, 1, shape=(3, ), dtype=np.float32)
    spaces = dict(action_1=s1, action_2=s2)
    action_space = Dict(spaces)

    # test retrieving it
    model_config = MODEL_DEFAULTS.copy()
    model_config["custom_action_dist"] = "test"
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertIsInstance(dist_cls, partial)
    self.assertEqual(param_shape, s1.n + 2 * s2.shape[0])

    # test the class works as a distribution
    dist_input = tf1.placeholder(tf.float32, (None, param_shape))
    model = Model()
    model.model_config = model_config
    dist = dist_cls(dist_input, model=model)
    self.assertIsInstance(dist.sample(), dict)
    self.assertIn("action_1", dist.sample())
    self.assertIn("action_2", dist.sample())
    self.assertEqual(dist.sample()["action_1"].dtype, tf.int64)
    self.assertEqual(dist.sample()["action_2"].shape[1:], s2.shape)

    with self.assertRaises(NotImplementedError):
        dist.entropy()
def _initialize(self, ob_space, ac_space, preprocessor, ac_noise_std):
    self.ac_space = ac_space
    self.ac_noise_std = ac_noise_std
    self.preprocessor_shape = preprocessor.transform_shape(ob_space.shape)

    with tf.variable_scope(type(self).__name__) as scope:
        # Observation normalization.
        ob_mean = tf.get_variable(
            'ob_mean', self.preprocessor_shape, tf.float32,
            tf.constant_initializer(np.nan), trainable=False)
        ob_std = tf.get_variable(
            'ob_std', self.preprocessor_shape, tf.float32,
            tf.constant_initializer(np.nan), trainable=False)
        in_mean = tf.placeholder(tf.float32, self.preprocessor_shape)
        in_std = tf.placeholder(tf.float32, self.preprocessor_shape)
        self._set_ob_mean_std = U.function([in_mean, in_std], [], updates=[
            tf.assign(ob_mean, in_mean),
            tf.assign(ob_std, in_std),
        ])

        inputs = tf.placeholder(
            tf.float32, [None] + list(self.preprocessor_shape))

        # TODO(ekl): we should do clipping in a standard RLlib preprocessor
        clipped_inputs = tf.clip_by_value(
            (inputs - ob_mean) / ob_std, -5.0, 5.0)

        # Policy network.
        dist_class, dist_dim = ModelCatalog.get_action_dist(
            self.ac_space, dist_type='deterministic')
        model = ModelCatalog.get_model(clipped_inputs, dist_dim)
        dist = dist_class(model.outputs)
        self._act = U.function([inputs], dist.sample())
    return scope
def __init__(self, registry, sess, action_space, preprocessor,
             observation_filter):
    self.sess = sess
    self.action_space = action_space
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(
        observation_filter, self.preprocessor.shape)
    self.inputs = tf.placeholder(
        tf.float32, [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(registry, self.inputs, dist_dim,
                                   options={"fcnet_hiddens": [32, 32]})
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum([
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items()
    ])
    self.sess.run(tf.global_variables_initializer())
def __init__(self,
             sess,
             action_space,
             obs_space,
             preprocessor,
             observation_filter,
             model_config,
             action_noise_std=0.0):
    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(observation_filter,
                                         self.preprocessor.shape)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space, model_config, dist_type="deterministic")
    model = ModelCatalog.get_model({
        "obs": self.inputs
    }, obs_space, dist_dim, model_config)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.tf_utils.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None, ) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None, ))

    action_space = self.env.action_space
    self.actions = ModelCatalog.get_action_placeholder(action_space)
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"])
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

    self.inputs = [("obs", self.observations),
                   ("value_targets", self.value_targets),
                   ("advantages", self.advantages),
                   ("actions", self.actions),
                   ("logprobs", self.prev_logits),
                   ("vf_preds", self.prev_vf_preds)]
    self.common_policy = self.build_tf_loss([ph for _, ph in self.inputs])

    # References to the model weights
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(config["observation_filter"],
                                 self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    self.sampler = SyncSampler(self.env, self.common_policy,
                               self.obs_filter, self.config["horizon"],
                               self.config["horizon"])
def _get_dist_class(policy: Policy, config: TrainerConfigDict,
                    action_space: gym.spaces.Space) -> \
        Type[TFActionDistribution]:
    """Helper function to return a dist class based on config and action space.

    Args:
        policy (Policy): The policy for which to return the action dist class.
        config (TrainerConfigDict): The Trainer's config dict.
        action_space (gym.spaces.Space): The action space used.

    Returns:
        Type[TFActionDistribution]: A TF distribution class.
    """
    if hasattr(policy, "dist_class") and policy.dist_class is not None:
        return policy.dist_class
    elif config["model"].get("custom_action_dist"):
        action_dist_class, _ = ModelCatalog.get_action_dist(
            action_space, config["model"], framework="tf")
        return action_dist_class
    elif isinstance(action_space, Discrete):
        return Categorical
    elif isinstance(action_space, Simplex):
        return Dirichlet
    else:
        assert isinstance(action_space, Box)
        if config["normalize_actions"]:
            return SquashedGaussian if \
                not config["_use_beta_distribution"] else Beta
        else:
            return DiagGaussian
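# Minimal usage sketch (not taken from the snippets above; the Box space and
# values are illustrative assumptions): ModelCatalog.get_action_dist returns
# both the distribution class and the flat width of model outputs it expects,
# which is then used as num_outputs when building the policy model.
import numpy as np
from gym.spaces import Box
from ray.rllib.models import MODEL_DEFAULTS, ModelCatalog

action_space = Box(-1.0, 1.0, shape=(2, ), dtype=np.float32)
model_config = MODEL_DEFAULTS.copy()

# For a 2-dim Box with the default DiagGaussian distribution, dist_dim is 4
# (a mean and a log_std per action dimension).
dist_class, dist_dim = ModelCatalog.get_action_dist(
    action_space, model_config, framework="tf")
assert dist_dim == 4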
def __init__(self, obs_space, action_space, config):
    super().__init__(obs_space, action_space, config)
    self.action_noise_std = self.config["action_noise_std"]
    self.preprocessor = ModelCatalog.get_preprocessor_for_space(
        self.observation_space)
    self.observation_filter = get_filter(self.config["observation_filter"],
                                         self.preprocessor.shape)

    self.single_threaded = self.config.get("single_threaded", False)
    if self.config["framework"] == "tf":
        self.sess = make_session(single_threaded=self.single_threaded)

        # Set graph-level seed.
        if config.get("seed") is not None:
            with self.sess.as_default():
                tf1.set_random_seed(config["seed"])

        self.inputs = tf1.placeholder(
            tf.float32, [None] + list(self.preprocessor.shape))
    else:
        if not tf1.executing_eagerly():
            tf1.enable_eager_execution()
        self.sess = self.inputs = None
        if config.get("seed") is not None:
            # Tf2.x.
            if config.get("framework") == "tf2":
                tf.random.set_seed(config["seed"])
            # Tf-eager.
            elif tf1 and config.get("framework") == "tfe":
                tf1.set_random_seed(config["seed"])

    # Policy network.
    self.dist_class, dist_dim = ModelCatalog.get_action_dist(
        self.action_space, self.config["model"], dist_type="deterministic")
    self.model = ModelCatalog.get_model_v2(
        obs_space=self.preprocessor.observation_space,
        action_space=self.action_space,
        num_outputs=dist_dim,
        model_config=self.config["model"],
    )

    self.sampler = None
    if self.sess:
        dist_inputs, _ = self.model({SampleBatch.CUR_OBS: self.inputs})
        dist = self.dist_class(dist_inputs, self.model)
        self.sampler = dist.sample()
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            dist_inputs, self.sess)
        self.sess.run(tf1.global_variables_initializer())
    else:
        self.variables = ray.experimental.tf_utils.TensorFlowVariables(
            [], None, self.model.variables())

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
def __init__(self, inputs, model, action_space, name):
    child_dist = []
    input_lens = []
    for action in action_space.spaces:
        dist, action_size = ModelCatalog.get_action_dist(action, {})
        child_dist.append(dist)
        input_lens.append(action_size)
    super().__init__(inputs, model, action_space, child_dist, input_lens)
    with tf.variable_scope(name):
        self.entropy_list = [s.entropy() for s in self.child_distributions]
def test_custom_action_distribution(self):
    class Model:
        pass

    ray.init(
        object_store_memory=1000 * 1024 * 1024,
        ignore_reinit_error=True)  # otherwise fails sometimes locally

    # registration
    ModelCatalog.register_custom_action_dist("test", CustomActionDistribution)
    action_space = Box(0, 1, shape=(5, 3), dtype=np.float32)

    # test retrieving it
    model_config = MODEL_DEFAULTS.copy()
    model_config["custom_action_dist"] = "test"
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(str(dist_cls), str(CustomActionDistribution))
    self.assertEqual(param_shape, action_space.shape)

    # test the class works as a distribution
    dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
    model = Model()
    model.model_config = model_config
    dist = dist_cls(dist_input, model=model)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()

    # test passing the options to it
    model_config["custom_model_config"].update({"output_dim": (3, )})
    dist_cls, param_shape = ModelCatalog.get_action_dist(
        action_space, model_config)
    self.assertEqual(param_shape, (3, ))
    dist_input = tf1.placeholder(tf.float32, (None, ) + param_shape)
    model.model_config = model_config
    dist = dist_cls(dist_input, model=model)
    self.assertEqual(dist.sample().shape[1:], dist_input.shape[1:])
    self.assertIsInstance(dist.sample(), tf.Tensor)
    with self.assertRaises(NotImplementedError):
        dist.entropy()
def build_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="tf",
    )

    return policy.model
def _init(self, config, env_creator):
    self.env = env_creator(config["env_config"])
    self.state = {}
    self._policy = ImitationTFPolicy
    action_space = self.env.action_space
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.workers = self._make_workers(env_creator, self._policy, config,
                                      self.config["num_workers"])
    self.execution_plan = default_execution_plan
    # self.train_exec_impl = self.execution_plan(self.workers, config)
    self.train_exec_impl = None
    self.optimizer = ImitationMetrics(self.workers)
def option_critic_make_model_and_action_dist(policy, obs_space, action_space,
                                             config):
    # Basic distribution class should be fine as long as I input the logits
    # corresponding to the correct option.
    # get_action_dist returns (dist_class, num_outputs); only the class is
    # needed here.
    dist_class, _ = ModelCatalog.get_action_dist(
        action_space, config, framework="torch")
    # Option critic vision network. May want to revise to register this as a
    # custom model, then grab it.
    model = OptionCriticVisionNetwork(obs_space, action_space,
                                      action_space.n, config, 'test')
    return model, dist_class
def get_distribution_inputs_and_class(policy,
                                      model,
                                      obs_batch,
                                      *,
                                      explore=True,
                                      is_training=False,
                                      **kwargs):
    model_out, _ = model({
        "obs": obs_batch,
        "is_training": is_training,
    }, [], None)
    dist_inputs = model.get_policy_output(model_out)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        model.action_space, policy.config["model"], framework="torch")

    return dist_inputs, dist_class, []  # []=state out
def make_mu_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework="torch")
    base_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework="torch")
    mu_model = MuZeroModel(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name="MuZeroModel",
        base_model=base_model)
    return mu_model
def __init__(self, obs_space, action_space, config):
    """Target Network is updated by the master learner every
    trainer.update_target_frequency steps. All worker batches are
    importance sampled w.r.t. the target network to ensure a more
    stable pi_old in PPO.
    """
    assert config[DELAY_UPDATE]
    _, logit_dim = ModelCatalog.get_action_dist(action_space,
                                                config["model"])
    self.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="tf")
    self.model_vars = self.model.variables()
    self.target_model_vars = self.target_model.variables()
    self.get_session().run(
        tf.initialize_variables(self.target_model_vars))

    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    assign_ops = []
    assert len(self.model_vars) == len(self.target_model_vars)
    for var, var_target in zip(self.model_vars, self.target_model_vars):
        assign_ops.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*assign_ops)

    @make_tf_callable(self.get_session(), True)
    def compute_clone_network_logits(ob):
        # def compute_clone_network_logits(ob, prev_action, prev_reward):
        # We do not support recurrent network now.
        feed_dict = {
            SampleBatch.CUR_OBS: tf.convert_to_tensor(ob),
            # SampleBatch.PREV_REWARDS: tf.convert_to_tensor(
            #     prev_reward),
            "is_training": tf.convert_to_tensor(False)
        }
        # if prev_action is not None:
        #     feed_dict[SampleBatch.PREV_ACTIONS] = tf.convert_to_tensor(
        #         prev_action)
        model_out, _ = self.target_model(feed_dict)
        return model_out

    self._compute_clone_network_logits = compute_clone_network_logits
def make_model_and_action_dist(policy, observation_space, action_space,
                               config):
    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space,
        config["model"],  # model_options
        dist_type="deterministic",
        framework="torch")
    model = ModelCatalog.get_model_v2(
        policy.preprocessor.observation_space,
        action_space,
        num_outputs=dist_dim,
        model_config=config["model"],
        framework="torch")
    # Make all model params not require any gradients.
    for p in model.parameters():
        p.requires_grad = False
    return model, dist_class
def __init__(self, obs_space, action_space, config):
    assert config[DELAY_UPDATE]
    # Build the target network of this policy.
    _, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"])
    self.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name="target_func",
        framework="tf")
    self.model_vars = self.model.variables()
    self.target_model_vars = self.target_model.variables()
    self.get_session().run(
        tf.variables_initializer(self.target_model_vars))

    # Here is the delayed update mechanism.
    self.tau_value = config.get("tau")
    self.tau = tf.placeholder(tf.float32, (), name="tau")
    assign_ops = []
    assert len(self.model_vars) == len(self.target_model_vars)
    for var, var_target in zip(self.model_vars, self.target_model_vars):
        assign_ops.append(
            var_target.assign(self.tau * var +
                              (1.0 - self.tau) * var_target))
    self.update_target_expr = tf.group(*assign_ops)

    @make_tf_callable(self.get_session(), True)
    def compute_clone_network_logits(ob):
        feed_dict = {
            SampleBatch.CUR_OBS: tf.convert_to_tensor(ob),
            "is_training": tf.convert_to_tensor(False)
        }
        model_out, _ = self.target_model(feed_dict)
        return model_out

    self._compute_clone_network_logits = compute_clone_network_logits
def make_nomad_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"], framework="torch")
    base_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=logit_dim,
        model_config=config["model"],
        framework="torch")
    nomad_model = NomadModel(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name="NomadModel",
        base_model=base_model,
        order=config["mcts_param"]["order"])
    return nomad_model
def build_appo_model(policy, obs_space, action_space, config):
    _, logit_dim = ModelCatalog.get_action_dist(action_space, config["model"])

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=POLICY_SCOPE,
        framework="torch" if config["use_pytorch"] else "tf")
    policy.model_variables = policy.model.variables()

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        logit_dim,
        config["model"],
        name=TARGET_POLICY_SCOPE,
        framework="torch" if config["use_pytorch"] else "tf")
    policy.target_model_variables = policy.target_model.variables()

    return policy.model
def __init__(self,
             sess,
             action_space,
             preprocessor,
             observation_filter,
             action_noise_std,
             options={}):
    if len(preprocessor.shape) > 1:
        raise UnsupportedSpaceException(
            "Observation space {} is not supported with ARS.".format(
                preprocessor.shape))

    self.sess = sess
    self.action_space = action_space
    self.action_noise_std = action_noise_std
    self.preprocessor = preprocessor
    self.observation_filter = get_filter(observation_filter,
                                         self.preprocessor.shape)
    self.inputs = tf.placeholder(tf.float32,
                                 [None] + list(self.preprocessor.shape))

    # Policy network.
    dist_class, dist_dim = ModelCatalog.get_action_dist(
        action_space, dist_type="deterministic")
    model = ModelCatalog.get_model(self.inputs, dist_dim, options=options)
    dist = dist_class(model.outputs)
    self.sampler = dist.sample()

    self.variables = ray.experimental.TensorFlowVariables(
        model.outputs, self.sess)

    self.num_params = sum(
        np.prod(variable.shape.as_list())
        for _, variable in self.variables.variables.items())
    self.sess.run(tf.global_variables_initializer())
def build_action_output(policy, model, input_dict, obs_space, action_space,
                        config):
    logits, _ = model({
        "obs": input_dict[SampleBatch.CUR_OBS],
        "is_training": policy._get_is_training_placeholder(),
    }, [], None)
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, config["model"])
    action_dist = dist_class(logits, model)

    stochastic_actions = action_dist.sample()
    log_pis = action_dist.sampled_action_logp()
    deterministic_actions = tf.math.argmax(logits, dimension=-1)

    actions = tf.cond(policy.stochastic, lambda: stochastic_actions,
                      lambda: deterministic_actions)
    action_probabilities = tf.cond(policy.stochastic, lambda: log_pis,
                                   lambda: tf.zeros_like(log_pis))

    policy.output_actions = actions
    return actions, action_probabilities
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs self.obs_t = tf.placeholder(tf.float32, shape=(None, ) + observation_space.shape) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder(tf.float32, [None], name="prev_reward") with tf.variable_scope(POLICY_SCOPE) as scope: self.model = ModelCatalog.get_model( { "obs": self.obs_t, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) logits = self.model.outputs self.p_func_vars = scope_vars(scope.name) # Action outputs action_dist = dist_cls(logits) self.output_actions = action_dist.sample() # Training inputs self.act_t = tf.placeholder(tf.int32, [None], name="action") self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward") # v network evaluation with tf.variable_scope(VALUE_SCOPE) as scope: state_values = self.model.value_function() self.v_func_vars = scope_vars(scope.name) self.v_loss = self._build_value_loss(state_values, self.cum_rew_t) self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t, logits, self.act_t, action_space) # which kind of objective to optimize objective = (self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss) self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) # initialize TFPolicy self.sess = tf.get_default_session() self.loss_inputs = [ (SampleBatch.CUR_OBS, self.obs_t), (SampleBatch.ACTIONS, self.act_t), (Postprocessing.ADVANTAGES, self.cum_rew_t), ] TFPolicy.__init__(self, observation_space, action_space, self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "total_loss": objective, "vf_explained_var": self.explained_variance, "policy_loss": self.p_loss.loss, "vf_loss": self.v_loss.loss }
def __init__(
        self, name, batchsize, preprocessor, config, logdir, is_remote):
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = BatchedEnv(name, batchsize, preprocessor=preprocessor)
    if preprocessor.shape is None:
        preprocessor.shape = self.env.observation_space.shape
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.preprocessor = preprocessor
    self.sess = tf.Session(config=config_proto)
    if config["use_tf_debugger"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter(
            "has_inf_or_nan", tf_debug.has_inf_or_nan)

    # Defines the training inputs.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + preprocessor.shape)
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    if isinstance(action_space, gym.spaces.Box):
        self.actions = tf.placeholder(
            tf.float32, shape=(None, action_space.shape[0]))
    elif isinstance(action_space, gym.spaces.Discrete):
        self.actions = tf.placeholder(tf.int64, shape=(None,))
    else:
        # NotImplementedError (not the NotImplemented constant) is the
        # correct exception to raise here.
        raise NotImplementedError(
            "action space " + str(type(action_space)) +
            " currently not supported")
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))

    assert config["sgd_batchsize"] % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    if is_remote:
        self.batch_size = 1
        self.per_device_batch_size = 1
    else:
        self.batch_size = config["sgd_batchsize"]
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, advs, acts, plog):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space,
            obs, advs, acts, plog, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
        self.devices,
        [self.observations, self.advantages, self.actions,
         self.prev_logits],
        self.per_device_batch_size,
        build_loss,
        self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[policy.loss for policy in policies]), 0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(
                values=[policy.mean_entropy for policy in policies]), 0)

    # References to the model weights
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.observation_filter = MeanStdFilter(preprocessor.shape, clip=None)
    self.reward_filter = MeanStdFilter((), clip=5.0)
    self.sess.run(tf.global_variables_initializer())
def required_model_output_shape(action_space, model_config):
    input_lens = []
    for action in action_space.spaces:
        dist, action_size = ModelCatalog.get_action_dist(action, {})
        input_lens.append(action_size)
    return sum(input_lens)
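# Worked sizing sketch (illustrative, not part of the snippet above): for a
# Tuple of Discrete(3) and Box(shape=(2,)), the default child distributions
# are Categorical (3 logits) and DiagGaussian (2 * 2 outputs: a mean and a
# log_std per dimension), so the summed flat model output width is 7.
import numpy as np
from gym.spaces import Box, Discrete, Tuple

action_space = Tuple(
    [Discrete(3), Box(-1.0, 1.0, shape=(2, ), dtype=np.float32)])
child_sizes = []
for space in action_space.spaces:
    if isinstance(space, Discrete):
        child_sizes.append(space.n)  # Categorical: one logit per action
    else:
        child_sizes.append(2 * space.shape[0])  # DiagGaussian: mean + log_std
assert sum(child_sizes) == 7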
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.is_remote = is_remote
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    if config["tf_debug_inf_or_nan"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter("has_inf_or_nan",
                                    tf_debug.has_inf_or_nan)

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None, ) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None, ))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None, ))

    action_space = self.env.action_space
    # TODO(rliaw): pull this into model_catalog
    if isinstance(action_space, gym.spaces.Box):
        self.actions = tf.placeholder(
            tf.float32, shape=(None, action_space.shape[0]))
    elif isinstance(action_space, gym.spaces.Discrete):
        self.actions = tf.placeholder(tf.int64, shape=(None, ))
    else:
        # NotImplementedError (not the NotImplemented constant) is the
        # correct exception to raise here.
        raise NotImplementedError(
            "action space " + str(type(action_space)) +
            " currently not supported")
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None, ))

    assert config["sgd_batchsize"] % len(devices) == 0, \
        "Batch size must be evenly divisible by devices"
    if is_remote:
        self.batch_size = config["rollout_batchsize"]
        self.per_device_batch_size = config["rollout_batchsize"]
    else:
        self.batch_size = config["sgd_batchsize"]
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space, obs,
            vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess, self.registry)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]), self.devices,
        [
            self.observations, self.value_targets, self.advantages,
            self.actions, self.prev_logits, self.prev_vf_preds
        ], self.per_device_batch_size, build_loss, self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[policy.loss for policy in policies]), 0)
        self.mean_policy_loss = tf.reduce_mean(
            tf.stack(
                values=[policy.mean_policy_loss for policy in policies]),
            0)
        self.mean_vf_loss = tf.reduce_mean(
            tf.stack(values=[policy.mean_vf_loss for policy in policies]),
            0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(values=[policy.mean_entropy for policy in policies]),
            0)

    # References to the model weights
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(config["observation_filter"],
                                 self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    self.sampler = SyncSampler(self.env, self.common_policy,
                               self.obs_filter, self.config["horizon"],
                               self.config["horizon"])
    self.sess.run(tf.global_variables_initializer())
def __init__(self, observation_space, action_space, config): config = dict(ray.rllib.agents.dqn.dqn.DEFAULT_CONFIG, **config) self.config = config dist_cls, logit_dim = ModelCatalog.get_action_dist( action_space, self.config["model"]) # Action inputs self.obs_t = tf.placeholder( tf.float32, shape=(None, ) + observation_space.shape) prev_actions_ph = ModelCatalog.get_action_placeholder(action_space) prev_rewards_ph = tf.placeholder( tf.float32, [None], name="prev_reward") with tf.variable_scope(P_SCOPE) as scope: self.model = ModelCatalog.get_model({ "obs": self.obs_t, "prev_actions": prev_actions_ph, "prev_rewards": prev_rewards_ph, "is_training": self._get_is_training_placeholder(), }, observation_space, action_space, logit_dim, self.config["model"]) logits = self.model.outputs self.p_func_vars = _scope_vars(scope.name) # Action outputs action_dist = dist_cls(logits) self.output_actions = action_dist.sample() # Training inputs self.act_t = tf.placeholder(tf.int32, [None], name="action") self.cum_rew_t = tf.placeholder(tf.float32, [None], name="reward") # v network evaluation with tf.variable_scope(V_SCOPE) as scope: state_values = self.model.value_function() self.v_func_vars = _scope_vars(scope.name) self.v_loss = self._build_value_loss(state_values, self.cum_rew_t) self.p_loss = self._build_policy_loss(state_values, self.cum_rew_t, logits, self.act_t, action_space) # which kind of objective to optimize objective = ( self.p_loss.loss + self.config["vf_coeff"] * self.v_loss.loss) self.explained_variance = tf.reduce_mean( explained_variance(self.cum_rew_t, state_values)) # initialize TFPolicyGraph self.sess = tf.get_default_session() self.loss_inputs = [ ("obs", self.obs_t), ("actions", self.act_t), ("advantages", self.cum_rew_t), ] TFPolicyGraph.__init__( self, observation_space, action_space, self.sess, obs_input=self.obs_t, action_sampler=self.output_actions, action_prob=action_dist.sampled_action_prob(), loss=objective, model=self.model, loss_inputs=self.loss_inputs, state_inputs=self.model.state_in, state_outputs=self.model.state_out, prev_action_input=prev_actions_ph, prev_reward_input=prev_rewards_ph) self.sess.run(tf.global_variables_initializer()) self.stats_fetches = { "total_loss": objective, "vf_explained_var": self.explained_variance, "policy_loss": self.p_loss.loss, "vf_loss": self.v_loss.loss }
def __init__(self, env_creator, config, is_ext_train=False):
    self.local_steps = 0
    self.config = config
    self.summarize = config.get("summarize")

    env = ModelCatalog.get_preprocessor_as_wrapper(
        env_creator(self.config["env_config"]), self.config["model"])

    if is_ext_train:
        train_dataset = input_fn(
            self.config["inverse_model"]["ext_train_file_path"])
        valid_dataset = input_fn(
            self.config["inverse_model"]["ext_valid_file_path"])
        iterator = tf.data.Iterator.from_structure(
            train_dataset.output_types, train_dataset.output_shapes)
        next_element = iterator.get_next()
        self.x = next_element[0]
        self.ac = next_element[1]

        self.training_init_op = iterator.make_initializer(train_dataset)
        self.validation_init_op = iterator.make_initializer(valid_dataset)
    else:
        self.x = tf.placeholder(
            tf.float32,
            shape=[
                None,
                numpy.prod([2] + list(env.observation_space.shape))
            ])
        if isinstance(env.action_space, gym.spaces.Box):
            self.ac = tf.placeholder(
                tf.float32, [None] + list(env.action_space.shape),
                name="ac")
        elif isinstance(env.action_space, gym.spaces.Discrete):
            self.ac = tf.placeholder(tf.int64, [None], name="ac")
        else:
            raise NotImplementedError(
                "action space " + str(type(env.action_space)) +
                " currently not supported")

    # Setup graph
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        env.action_space, self.config["model"])
    self._model = FullyConnectedNetwork(self.x, logit_dim, {})
    self.logits = self._model.outputs
    self.curr_dist = dist_class(self.logits)
    self.sample = self.curr_dist.sample()
    self.var_list = tf.get_collection(
        tf.GraphKeys.TRAINABLE_VARIABLES,
        tf.get_variable_scope().name)

    # Setup loss
    log_prob = self.curr_dist.logp(self.ac)
    self.pi_loss = -tf.reduce_sum(log_prob)
    self.loss = self.pi_loss
    self.optimizer = tf.train.AdamOptimizer(
        self.config["lr"]).minimize(self.loss)

    # Setup similarity -> cosine similarity
    normalize_sample = tf.nn.l2_normalize(self.sample, 1)
    normalize_ac = tf.nn.l2_normalize(self.ac, 1)
    self.similarity = 1 - tf.losses.cosine_distance(
        normalize_sample, normalize_ac, dim=1)

    # Initialize
    self.initialize()
def __init__(self, registry, env_creator, config, logdir, is_remote):
    self.registry = registry
    self.is_remote = is_remote
    if is_remote:
        os.environ["CUDA_VISIBLE_DEVICES"] = ""
        devices = ["/cpu:0"]
    else:
        devices = config["devices"]
    self.devices = devices
    self.config = config
    self.logdir = logdir
    self.env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    if is_remote:
        config_proto = tf.ConfigProto()
    else:
        config_proto = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=config_proto)
    if config["tf_debug_inf_or_nan"] and not is_remote:
        self.sess = tf_debug.LocalCLIDebugWrapperSession(self.sess)
        self.sess.add_tensor_filter(
            "has_inf_or_nan", tf_debug.has_inf_or_nan)

    # Defines the training inputs:
    # The coefficient of the KL penalty.
    self.kl_coeff = tf.placeholder(
        name="newkl", shape=(), dtype=tf.float32)
    # The input observations.
    self.observations = tf.placeholder(
        tf.float32, shape=(None,) + self.env.observation_space.shape)
    # Targets of the value function.
    self.value_targets = tf.placeholder(tf.float32, shape=(None,))
    # Advantage values in the policy gradient estimator.
    self.advantages = tf.placeholder(tf.float32, shape=(None,))

    action_space = self.env.action_space
    self.actions = ModelCatalog.get_action_placeholder(action_space)
    self.distribution_class, self.logit_dim = ModelCatalog.get_action_dist(
        action_space)
    # Log probabilities from the policy before the policy update.
    self.prev_logits = tf.placeholder(
        tf.float32, shape=(None, self.logit_dim))
    # Value function predictions before the policy update.
    self.prev_vf_preds = tf.placeholder(tf.float32, shape=(None,))

    if is_remote:
        self.batch_size = config["rollout_batchsize"]
        self.per_device_batch_size = config["rollout_batchsize"]
    else:
        self.batch_size = int(
            config["sgd_batchsize"] / len(devices)) * len(devices)
        assert self.batch_size % len(devices) == 0
        self.per_device_batch_size = int(self.batch_size / len(devices))

    def build_loss(obs, vtargets, advs, acts, plog, pvf_preds):
        return ProximalPolicyLoss(
            self.env.observation_space, self.env.action_space,
            obs, vtargets, advs, acts, plog, pvf_preds, self.logit_dim,
            self.kl_coeff, self.distribution_class, self.config,
            self.sess, self.registry)

    self.par_opt = LocalSyncParallelOptimizer(
        tf.train.AdamOptimizer(self.config["sgd_stepsize"]),
        self.devices,
        [self.observations, self.value_targets, self.advantages,
         self.actions, self.prev_logits, self.prev_vf_preds],
        self.per_device_batch_size,
        build_loss,
        self.logdir)

    # Metric ops
    with tf.name_scope("test_outputs"):
        policies = self.par_opt.get_device_losses()
        self.mean_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.loss for policy in policies]), 0)
        self.mean_policy_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_policy_loss for policy in policies]), 0)
        self.mean_vf_loss = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_vf_loss for policy in policies]), 0)
        self.mean_kl = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_kl for policy in policies]), 0)
        self.mean_entropy = tf.reduce_mean(
            tf.stack(values=[
                policy.mean_entropy for policy in policies]), 0)

    # References to the model weights
    self.common_policy = self.par_opt.get_common_loss()
    self.variables = ray.experimental.TensorFlowVariables(
        self.common_policy.loss, self.sess)
    self.obs_filter = get_filter(
        config["observation_filter"], self.env.observation_space.shape)
    self.rew_filter = MeanStdFilter((), clip=5.0)
    self.filters = {"obs_filter": self.obs_filter,
                    "rew_filter": self.rew_filter}
    self.sampler = SyncSampler(
        self.env, self.common_policy, self.obs_filter,
        self.config["horizon"], self.config["horizon"])
    self.sess.run(tf.global_variables_initializer())