def build_sac_model(policy, obs_space, action_space, config):
    if config["model"].get("custom_model"):
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for SAC.".format(action_space))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    # Infer num_outputs as the action space dim (not the embedding size!),
    # rather than going through the catalog:
    # _, num_outputs = ModelCatalog.get_action_dist(
    #     action_space, config["model"], framework="torch")
    # Note: `.n` only exists on Discrete spaces; use the Box shape otherwise.
    num_outputs = (action_space.n if isinstance(action_space, Discrete) else
                   int(np.prod(action_space.shape)))

    # Force-ignore any additionally provided hidden layer sizes.
    # Everything should be configured using SAC's "Q_model" and
    # "policy_model" settings.
    policy.model = BaselineSACTorchModel(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        name="sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
        # Customs.
        embed_dim=config["embed_dim"],
        encoder_type=config["encoder_type"])

    policy.target_model = BaselineSACTorchModel(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        name="target_sac_model",
        actor_hidden_activation=config["policy_model"]["fcnet_activation"],
        actor_hiddens=config["policy_model"]["fcnet_hiddens"],
        critic_hidden_activation=config["Q_model"]["fcnet_activation"],
        critic_hiddens=config["Q_model"]["fcnet_hiddens"],
        twin_q=config["twin_q"],
        initial_alpha=config["initial_alpha"],
        target_entropy=config["target_entropy"],
        # Customs.
        embed_dim=config["embed_dim"],
        encoder_type=config["encoder_type"])

    return policy.model
def validate_spaces(policy: Policy, observation_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> None:
    """Validates the observation- and action spaces used for the Policy.

    Args:
        policy (Policy): The policy, whose spaces are being validated.
        observation_space (gym.spaces.Space): The observation space to
            validate.
        action_space (gym.spaces.Space): The action space to validate.
        config (TrainerConfigDict): The Policy's config dict.

    Raises:
        UnsupportedSpaceException: If one of the spaces is not supported.
    """
    # Only support single Box or single Discrete spaces.
    if not isinstance(action_space, (Box, Discrete, Simplex)):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "SAC. Must be [Box|Discrete|Simplex].".format(
                action_space, policy))
    # If Box, make sure it's a 1D vector space.
    elif isinstance(action_space,
                    (Box, Simplex)) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, policy, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
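# A minimal sketch of this validator in action. The stub policy name and the
# spaces below are illustrative, not from the source; `policy` is only
# interpolated into the error message, so any printable object works here.
import gym
import numpy as np

demo_obs_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(4, ), dtype=np.float32)
demo_bad_space = gym.spaces.Box(low=-1.0, high=1.0, shape=(2, 3), dtype=np.float32)
try:
    validate_spaces("demo_policy", demo_obs_space, demo_bad_space, config={})
except UnsupportedSpaceException as e:
    print(e)  # -> asks to reshape to 1D, use a Tuple space, or multi-agent API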
def build_ddpg_model(policy, obs_space, action_space, config):
    if config["model"]["custom_model"]:
        logger.warning(
            "Setting use_state_preprocessor=True since a custom model "
            "was specified.")
        config["use_state_preprocessor"] = True
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(action_space))
    if len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space has multiple dimensions "
            "{}. ".format(action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")

    if config["use_state_preprocessor"]:
        default_model = None  # catalog decides
        num_outputs = 256  # arbitrary
        config["model"]["no_final_linear"] = True
    else:
        default_model = NoopModel
        num_outputs = int(np.prod(obs_space.shape))

    policy.model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        parameter_noise=config["parameter_noise"],
        twin_q=config["twin_q"])

    policy.target_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DDPGModel,
        default_model=default_model,
        name="target_ddpg_model",
        actor_hidden_activation=config["actor_hidden_activation"],
        actor_hiddens=config["actor_hiddens"],
        critic_hidden_activation=config["critic_hidden_activation"],
        critic_hiddens=config["critic_hiddens"],
        parameter_noise=config["parameter_noise"],
        twin_q=config["twin_q"])

    return policy.model
def validate_spaces(pid, observation_space, action_space, config):
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "SAC.".format(action_space, pid))
    if isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, pid, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
def validate_spaces(pid: PolicyID, observation_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> None:
    if not isinstance(action_space, Box):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "DDPG.".format(action_space, pid))
    elif len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, pid, action_space.shape) +
            "Consider reshaping this into a single dimension, "
            "using a Tuple action space, or the multi-agent API.")
def build_q_networks(policy, input_dict, observation_space, action_space,
                     config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    # Action Q network.
    with tf.variable_scope(Q_SCOPE) as scope:
        q_values, q_logits, q_dist, _ = _build_q_network(
            policy, input_dict[SampleBatch.CUR_OBS], observation_space,
            action_space)
        policy.q_values = q_values
        policy.q_func_vars = _scope_vars(scope.name)

    # Noise vars for Q network, except for layer normalization vars.
    if config["parameter_noise"]:
        _build_parameter_noise(
            policy,
            [var for var in policy.q_func_vars if "LayerNorm" not in var.name])
        policy.action_probs = tf.nn.softmax(policy.q_values)

    # Action outputs.
    qvp = QValuePolicy(q_values, input_dict[SampleBatch.CUR_OBS],
                       action_space.n, policy.stochastic, policy.eps,
                       policy.config["soft_q"], policy.config["softmax_temp"])
    policy.output_actions, policy.action_prob = qvp.action, qvp.action_prob

    return policy.output_actions, policy.action_prob
def _make_box_from_dict(space):
    """Convert a spaces.Dict into a single flat spaces.Box."""
    sp = list(space.spaces.values())
    lows = []
    highs = []
    for s in sp:
        if isinstance(s, gym.spaces.Discrete):
            highs.append(s.n)
            lows.append(0)
        elif isinstance(s, gym.spaces.MultiBinary):
            sh = reduce(lambda x, y: x * y, s.shape)
            highs += [1] * sh
            lows += [0] * sh
        elif isinstance(s, gym.spaces.Box):
            highs += s.high.flatten().tolist()
            lows += s.low.flatten().tolist()
        else:
            raise UnsupportedSpaceException(
                "Space {} is not supported.".format(space))
    highs = np.asarray(highs)
    lows = np.asarray(lows)
    return gym.spaces.Box(high=highs, low=lows)
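# A minimal usage sketch for the converter above. The space layout is
# illustrative, not from the source: each Box dim and binary flag contributes
# one dimension, while a Discrete sub-space contributes a single scalar here.
from functools import reduce

import gym
import numpy as np

demo_dict_space = gym.spaces.Dict({
    "position": gym.spaces.Box(low=-1.0, high=1.0, shape=(2, ), dtype=np.float32),
    "mode": gym.spaces.Discrete(3),
    "flags": gym.spaces.MultiBinary(4),
})
flat_space = _make_box_from_dict(demo_dict_space)
print(flat_space.shape)  # -> (7,): 4 binary dims + 1 Discrete scalar + 2 Box dims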
def validate_spaces(
    policy: Policy,
    observation_space: gym.spaces.Space,
    action_space: gym.spaces.Space,
    config: AlgorithmConfigDict,
) -> None:
    """Validates the observation- and action spaces used for the Policy.

    Args:
        policy: The policy, whose spaces are being validated.
        observation_space: The observation space to validate.
        action_space: The action space to validate.
        config: The Policy's config dict.

    Raises:
        UnsupportedSpaceException: If one of the spaces is not supported.
    """
    # Only support single Discrete spaces.
    if not isinstance(action_space, gym.spaces.Discrete):
        msg = (
            f"Action space ({action_space}) of {policy} is not supported for "
            f"Bandit algorithms. Must be `Discrete`."
        )
        # Hint at using the MultiDiscrete to Discrete wrapper for Bandits.
        if isinstance(action_space, gym.spaces.MultiDiscrete):
            msg += (
                " Try to wrap your environment with the "
                "`ray.rllib.env.wrappers.recsim::"
                "MultiDiscreteToDiscreteActionWrapper` class: `tune.register_env("
                "[some str], lambda ctx: MultiDiscreteToDiscreteActionWrapper("
                "[your gym env])); config = {'env': [some str]}`"
            )
        raise UnsupportedSpaceException(msg)
def build_q_models(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        num_outputs = 256
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name=Q_SCOPE,
        model_interface=SimpleQModel,
        q_hiddens=config["hiddens"])

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        name=Q_TARGET_SCOPE,
        model_interface=SimpleQModel,
        q_hiddens=config["hiddens"])

    return policy.q_model
def build_q_models(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def build_q_models(policy, obs_space, action_space, config):
    policy.log_stats = config["log_stats"]
    if policy.log_stats:
        policy.stats_dict = {}
        policy.stats_fn = config["stats_fn"]

    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.device = (torch.device("cuda")
                     if torch.cuda.is_available() else torch.device("cpu"))

    default_model = (RNNModel
                     if config["recurrent_dqn"] else FullyConnectedNetwork)

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model,
        name=Q_SCOPE).to(policy.device)

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        default_model=default_model,
        name=Q_TARGET_SCOPE).to(policy.device)

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def setup_loss(self, action_space):
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        self.ac = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        self.ac = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(action_space))
    self.adv = tf.placeholder(tf.float32, [None], name="adv")
    self.r = tf.placeholder(tf.float32, [None], name="r")

    log_prob = self.action_dist.logp(self.ac)

    # The "policy gradients" loss: its derivative is precisely the policy
    # gradient. Notice that self.ac is a placeholder that is provided
    # externally. adv will contain the advantages, as calculated in
    # compute_advantages.
    self.pi_loss = -tf.reduce_sum(log_prob * self.adv)

    delta = self.vf - self.r
    self.vf_loss = 0.5 * tf.reduce_sum(tf.square(delta))
    self.entropy = tf.reduce_sum(self.action_dist.entropy())
    self.loss = (self.pi_loss + self.vf_loss * self.config["vf_loss_coeff"] +
                 self.entropy * self.config["entropy_coeff"])
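# For intuition, the same composite A3C loss spelled out in plain NumPy on an
# illustrative batch. The arrays and coefficients below are made up; sign
# conventions mirror the code above, where the entropy term is *added*, so a
# negative entropy_coeff acts as an exploration bonus.
import numpy as np

logp = np.array([-0.5, -1.2, -0.8])  # log pi(a_t | s_t)
adv = np.array([1.0, -0.3, 0.7])     # advantages (see compute_advantages)
vf = np.array([0.9, 0.2, 0.4])       # value predictions
r = np.array([1.1, 0.0, 0.6])        # value targets
entropy = np.array([0.6, 0.7, 0.5])  # per-sample policy entropy

pi_loss = -np.sum(logp * adv)              # policy-gradient surrogate
vf_loss = 0.5 * np.sum((vf - r) ** 2)      # value-function regression
vf_loss_coeff, entropy_coeff = 0.5, -0.01  # stand-ins for config values
total_loss = pi_loss + vf_loss * vf_loss_coeff + np.sum(entropy) * entropy_coeff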
def build_q_model_and_distribution_comp(policy, obs_space, action_space,
                                        config):
    # Keys of the observation space that must be used at train and test time.
    policy.train_obs_keys = config["train_obs_keys"]
    policy.test_obs_keys = config["test_obs_keys"]

    # Check whether the policy's action space is wrapped inside a Tuple space.
    policy.requires_tupling = False
    if isinstance(action_space, Tuple) and len(action_space.spaces) == 1:
        policy.action_space = action_space.spaces[0]
        action_space = action_space.spaces[0]
        policy.requires_tupling = True
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    # Get the real (original) observation space.
    if isinstance(obs_space, Box):
        assert hasattr(obs_space,
                       "original_space"), "Invalid observation space"
        obs_space = obs_space.original_space
    if isinstance(obs_space, Tuple):
        obs_space = obs_space.spaces[0]
    assert isinstance(obs_space, Dict), "Invalid observation space"
    policy.has_action_mask = "action_mask" in obs_space.spaces
    assert all([k in obs_space.spaces for k in policy.train_obs_keys
                ]), "Invalid train keys specification"
    assert all([k in obs_space.spaces for k in policy.test_obs_keys
                ]), "Invalid test keys specification"

    # Get the observation space used for training.
    if config["train_obs_space"] is None:
        train_obs_space = obs_space
    else:
        train_obs_space = config["train_obs_space"]
    if isinstance(train_obs_space, Box):
        assert hasattr(train_obs_space,
                       "original_space"), "Invalid observation space"
        train_obs_space = train_obs_space.original_space
    if isinstance(train_obs_space, Tuple):
        train_obs_space = train_obs_space.spaces[0]

    # Obs spaces used for training and testing.
    sp = Dict({k: obs_space.spaces[k] for k in policy.test_obs_keys})
    policy.real_test_obs_space = flatten_space(sp)
    policy.real_test_obs_space.original_space = sp
    sp = Dict({k: train_obs_space.spaces[k] for k in policy.train_obs_keys})
    policy.real_train_obs_space = flatten_space(sp)
    policy.real_train_obs_space.original_space = sp

    policy.n_actions = action_space.n

    model_space = Dict({
        k: obs_space.spaces[k]
        for k in policy.test_obs_keys
        if k != "action_mask" and k != "signal"
    })
    return build_q_models(policy, flatten_space(model_space), action_space,
                          config), TorchCategorical
def build_q_model(policy: Policy, obs_space: gym.Space,
                  action_space: gym.Space,
                  config: TrainerConfigDict) -> ModelV2:
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # Try to infer the last layer size, otherwise fall back to 256.
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="tf",
        model_interface=DistributionalQTFModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        # TODO(sven): Move option to add LayerNorm after each Dense
        #  generically into ModelCatalog.
        add_layer_norm=isinstance(
            getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    return policy.q_model
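# The `num_outputs` inference above is just "last configured hidden size,
# else 256", spelled out here with an illustrative config value:
fcnet_hiddens = [64, 32]                   # e.g. config["model"]["fcnet_hiddens"]
num_outputs = ([256] + fcnet_hiddens)[-1]  # -> 32
assert ([256] + [])[-1] == 256             # an empty list falls back to 256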
def build_q_model_and_distribution(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # Try to infer the last layer size, otherwise fall back to 256.
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    # TODO(sven): Move option to add LayerNorm after each Dense
    #  generically into ModelCatalog.
    add_layer_norm = (
        isinstance(getattr(policy, "exploration", None), ParameterNoise)
        or config["exploration_config"]["type"] == "ParameterNoise")

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])
    policy.q_func_vars = policy.q_model.variables()

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=num_outputs,
        model_config=config["model"],
        framework="torch",
        model_interface=DQNTorchModel,
        name=Q_TARGET_SCOPE,
        dueling=config["dueling"],
        q_hiddens=config["hiddens"],
        use_noisy=config["noisy"],
        sigma0=config["sigma0"],
        add_layer_norm=add_layer_norm,
        decompose_num=config["decompose_num"])
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model, TorchMultiObjCategorical
def __init__(self, observation_space, action_space, config):
    config = dict(ray.rllib.agents.a3c.a3c.DEFAULT_CONFIG, **config)
    self.config = config
    self.sess = tf.get_default_session()

    # Setup the policy.
    self.observations = tf.placeholder(tf.float32,
                                       [None] + list(observation_space.shape))
    dist_class, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])
    self.model = ModelCatalog.get_model(self.observations, logit_dim,
                                        self.config["model"])
    action_dist = dist_class(self.model.outputs)
    self.vf = tf.reshape(
        linear(self.model.last_layer, 1, "value", normc_initializer(1.0)),
        [-1])
    self.var_list = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                      tf.get_variable_scope().name)

    # Setup the policy loss.
    if isinstance(action_space, gym.spaces.Box):
        ac_size = action_space.shape[0]
        actions = tf.placeholder(tf.float32, [None, ac_size], name="ac")
    elif isinstance(action_space, gym.spaces.Discrete):
        actions = tf.placeholder(tf.int64, [None], name="ac")
    else:
        raise UnsupportedSpaceException(
            "Action space {} is not supported for A3C.".format(action_space))
    advantages = tf.placeholder(tf.float32, [None], name="advantages")
    v_target = tf.placeholder(tf.float32, [None], name="v_target")
    self.loss = A3CLoss(action_dist, actions, advantages, v_target, self.vf,
                        self.config["vf_loss_coeff"],
                        self.config["entropy_coeff"])

    # Initialize TFPolicyGraph.
    loss_in = [
        ("obs", self.observations),
        ("actions", actions),
        ("advantages", advantages),
        ("value_targets", v_target),
    ]
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=self.observations,
        action_sampler=action_dist.sample(),
        loss=self.loss.total_loss,
        loss_inputs=loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        seq_lens=self.model.seq_lens,
        max_seq_len=self.config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
def _make_continuous_space(space):
    if isinstance(space, Box):
        return space
    elif isinstance(space, Discrete):
        return Box(low=np.zeros((space.n, )), high=np.ones((space.n, )))
    else:
        raise UnsupportedSpaceException(
            "Space {} is not supported.".format(space))
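# A quick illustration of the mapping (the spaces below are hypothetical):
import numpy as np
from gym.spaces import Box, Discrete

print(_make_continuous_space(Discrete(4)))
# -> a 4-dim Box with lows 0.0 and highs 1.0 (one dimension per action)
demo_box = Box(low=-2.0, high=2.0, shape=(3, ), dtype=np.float32)
print(_make_continuous_space(demo_box) is demo_box)  # -> True; Box passes through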
def __init__(self, env: gym.Env):
    super().__init__(env)
    if not isinstance(env.action_space, MultiDiscrete):
        raise UnsupportedSpaceException(
            f"Action space {env.action_space} "
            f"is not supported by {self.__class__.__name__}")
    self.action_space_dimensions = env.action_space.nvec
    self.action_space = Discrete(np.prod(self.action_space_dimensions))
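# A hedged usage sketch, assuming the constructor above belongs to the
# MultiDiscreteToDiscreteActionWrapper referenced by the Bandit validator
# earlier in this file set. `ToyEnv` is made up for the demo.
import gym
import numpy as np
from gym.spaces import Box, MultiDiscrete


class ToyEnv(gym.Env):
    """Hypothetical env, just enough to exercise the wrapper."""

    observation_space = Box(low=0.0, high=1.0, shape=(2, ), dtype=np.float32)
    action_space = MultiDiscrete([3, 2])

    def reset(self):
        return np.zeros(2, dtype=np.float32)

    def step(self, action):
        return np.zeros(2, dtype=np.float32), 0.0, True, {}


wrapped = MultiDiscreteToDiscreteActionWrapper(ToyEnv())
print(wrapped.action_space)  # -> Discrete(6): the 3 * 2 choices, flattened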
def __init__(self, observation_space, action_space, config):
    # Validate spaces.
    # Only support single Box or single Discrete spaces.
    if not isinstance(action_space, (Box, Discrete)):
        raise UnsupportedSpaceException(
            "Action space ({}) of {} is not supported for "
            "MB-MPO. Must be [Box|Discrete].".format(action_space, self))
    # If Box, make sure it's a 1D vector space.
    elif isinstance(action_space, Box) and len(action_space.shape) > 1:
        raise UnsupportedSpaceException(
            "Action space ({}) of {} has multiple dimensions "
            "{}. ".format(action_space, self, action_space.shape) +
            "Consider reshaping this into a single dimension Box space "
            "or using the multi-agent API.")

    config = dict(ray.rllib.algorithms.mbmpo.mbmpo.DEFAULT_CONFIG, **config)

    super().__init__(observation_space, action_space, config)
def get_action_dist(action_space, config, dist_type=None, torch=False):
    """Returns action distribution class and size for the given action space.

    Args:
        action_space (Space): Action space of the target gym env.
        config (dict): Optional model config.
        dist_type (str): Optional identifier of the action distribution.
        torch (bool): Optional; whether to return a PyTorch distribution.

    Returns:
        dist_class (ActionDistribution): Python class of the distribution.
        dist_dim (int): The size of the input vector to the distribution.
    """
    config = config or MODEL_DEFAULTS
    if isinstance(action_space, gym.spaces.Box):
        if len(action_space.shape) > 1:
            raise UnsupportedSpaceException(
                "Action space has multiple dimensions "
                "{}. ".format(action_space.shape) +
                "Consider reshaping this into a single dimension, "
                "using a Tuple action space, or the multi-agent API.")
        if dist_type is None:
            dist = TorchDiagGaussian if torch else DiagGaussian
            return dist, action_space.shape[0] * 2
        elif dist_type == "deterministic":
            return Deterministic, action_space.shape[0]
    elif isinstance(action_space, gym.spaces.Discrete):
        dist = TorchCategorical if torch else Categorical
        return dist, action_space.n
    elif isinstance(action_space, gym.spaces.Tuple):
        child_dist = []
        input_lens = []
        for action in action_space.spaces:
            dist, action_size = ModelCatalog.get_action_dist(action, config)
            child_dist.append(dist)
            input_lens.append(action_size)
        if torch:
            raise NotImplementedError
        return partial(
            MultiActionDistribution,
            child_distributions=child_dist,
            action_space=action_space,
            input_lens=input_lens), sum(input_lens)
    elif isinstance(action_space, Simplex):
        if torch:
            raise NotImplementedError
        return Dirichlet, action_space.shape[0]
    elif isinstance(action_space, gym.spaces.multi_discrete.MultiDiscrete):
        if torch:
            raise NotImplementedError
        return partial(MultiCategorical, input_lens=action_space.nvec), \
            int(sum(action_space.nvec))

    raise NotImplementedError("Unsupported args: {} {}".format(
        action_space, dist_type))
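# A short sketch of the returned sizes (illustrative spaces; this function is
# typically invoked as a ModelCatalog static method, as assumed here):
import gym

dist_cls, dim = ModelCatalog.get_action_dist(
    gym.spaces.Box(-1.0, 1.0, (3, )), config=None)
print(dist_cls.__name__, dim)  # -> DiagGaussian 6 (mean + log-std per dim)

dist_cls, dim = ModelCatalog.get_action_dist(
    gym.spaces.Discrete(5), config=None)
print(dist_cls.__name__, dim)  # -> Categorical 5 (one logit per action)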
def __init__(self, registry, env_creator, config, logdir, worker_index):
    env = env_creator(config["env_config"])
    env = wrap_dqn(registry, env, config["model"], config["random_starts"])
    self.env = env
    self.config = config

    # When env.action_space is of Box type, e.g., Pendulum-v0,
    # action_space.low is [-2.0] and high is [2.0].
    # Take an action by calling, e.g., env.step([3.5]).
    if not isinstance(env.action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                env.action_space))

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

    # Use either a different `eps` per worker, or a linear schedule.
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, "This requires multiple workers"
        self.exploration = ConstantSchedule(
            config["noise_scale"] * 0.4 **
            (1 + worker_index / float(config["num_workers"] - 1) * 7))
    else:
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=config["noise_scale"] * 1.0,
            final_p=config["noise_scale"] * config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    # Hard update instead of soft.
    self.ddpg_graph.update_target(self.sess, 1.0)
    self.global_timestep = 0
    self.local_timestep = 0

    # Note that this encompasses both the policy and Q-value networks and
    # their corresponding target networks.
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.ddpg_graph.q_tp0, self.ddpg_graph.q_tp1), self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
def build_q_model(policy, obs_space, action_space, config):
    if not isinstance(action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    if config["hiddens"]:
        # Try to infer the last layer size, otherwise fall back to 256.
        num_outputs = ([256] + config["model"]["fcnet_hiddens"])[-1]
        config["model"]["no_final_linear"] = True
    else:
        num_outputs = action_space.n

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_SCOPE,
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space,
        action_space,
        num_outputs,
        config["model"],
        framework="tf",
        model_interface=DistributionalQModel,
        name=Q_TARGET_SCOPE,
        num_atoms=config["num_atoms"],
        q_hiddens=config["hiddens"],
        dueling=config["dueling"],
        use_noisy=config["noisy"],
        v_min=config["v_min"],
        v_max=config["v_max"],
        sigma0=config["sigma0"],
        parameter_noise=config["parameter_noise"])

    return policy.q_model
def __init__(self, registry, env_creator, config, logdir, worker_index):
    env = env_creator(config["env_config"])
    env = wrap_dqn(registry, env, config["model"], config["random_starts"])
    self.env = env
    self.config = config

    if not isinstance(env.action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                env.action_space))

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.ddpg_graph = models.DDPGGraph(registry, env, config, logdir)

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.ddpg_graph.copy_target(self.sess)
    self.global_timestep = 0
    self.local_timestep = 0

    nb_actions = env.action_space.shape[-1]
    stddev = config["exploration_noise"]
    self.exploration_noise = OUNoise(
        mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    self.action_range = (-1., 1.)

    # Note that this encompasses both the Q and target network.
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.ddpg_graph.critic_loss, self.ddpg_graph.action_loss),
        self.sess)

    self.max_action = env.action_space.high
    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None

    # Technically not needed when not remote.
    self.obs_filter = get_filter(config["observation_filter"],
                                 env.observation_space.shape)
    self.rew_filter = get_filter(config["reward_filter"], ())
    self.filters = {
        "obs_filter": self.obs_filter,
        "rew_filter": self.rew_filter
    }
    self.obs = self.env.reset()
def _build_q_models(policy: Policy, obs_space: gym.spaces.Space,
                    action_space: gym.spaces.Space,
                    config: TrainerConfigDict) -> ModelV2:
    """Builds q_model and target_q_model for Simple Q learning.

    Note that this function works for both TensorFlow and PyTorch.

    Args:
        policy (Policy): The Policy, which will use the model for
            optimization.
        obs_space (gym.spaces.Space): The policy's observation space.
        action_space (gym.spaces.Space): The policy's action space.
        config (TrainerConfigDict): The Policy's config dict.

    Returns:
        ModelV2: The Model for the Policy to use. Note: The target q model
            will not be returned, just assigned to `policy.target_q_model`.
    """
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(action_space))

    policy.q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_SCOPE)
    if torch.cuda.is_available():
        policy.q_model = policy.q_model.to("cuda")

    policy.target_q_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=Q_TARGET_SCOPE)
    if torch.cuda.is_available():
        policy.target_q_model = policy.target_q_model.to("cuda")

    policy.q_func_vars = policy.q_model.variables()
    policy.target_q_func_vars = policy.target_q_model.variables()

    return policy.q_model
def build_avg_model_and_distribution(
        policy: Policy, obs_space: gym.spaces.Space,
        action_space: gym.spaces.Space, config: TrainerConfigDict
) -> Tuple[ModelV2, Type[TorchDistributionWrapper]]:
    if not isinstance(action_space, gym.spaces.Discrete):
        raise UnsupportedSpaceException(
            f"Action space {action_space} is not supported for NFSP.")

    policy.avg_model = ModelCatalog.get_model_v2(
        obs_space=obs_space,
        action_space=action_space,
        num_outputs=action_space.n,
        model_config=config["model"],
        framework=config["framework"],
        name=AVG_POL_SCOPE)

    policy.avg_func_vars = policy.avg_model.variables()

    return policy.avg_model, TorchCategorical
def __init__(self, registry, env_creator, config, logdir, worker_index):
    env = env_creator(config["env_config"])
    env = wrap_dqn(registry, env, config["model"], config["random_starts"])
    self.env = env
    self.config = config

    if not isinstance(env.action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(
                env.action_space))

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

    # Use either a different `eps` per worker, or a linear schedule.
    if config["per_worker_exploration"]:
        assert config["num_workers"] > 1, "This requires multiple workers"
        self.exploration = ConstantSchedule(
            0.4 ** (1 + worker_index / float(config["num_workers"] - 1) * 7))
    else:
        self.exploration = LinearSchedule(
            schedule_timesteps=int(config["exploration_fraction"] *
                                   config["schedule_max_timesteps"]),
            initial_p=1.0,
            final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.dqn_graph.update_target(self.sess)
    self.global_timestep = 0
    self.local_timestep = 0

    # Note that this encompasses both the Q and target network.
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
def _make_array_from_obs(obs, size, spaces):
    """Flatten an original observation dict into a one-dimensional np.array.

    :param obs: dict, original observation dictionary
    :param size: int, total flattened size of the wrapped env's observation
    :param spaces: dict, gym spaces keyed like `obs`
    :return: np.array, flattened array of observations
    """
    # Initialize a zeros array of the correct total size.
    array = np.zeros(size)
    offset = 0
    # For every observation...
    for k in spaces.keys():
        # Get the gym space related to this observation.
        sp = spaces[k]
        v = obs[k]
        # If MultiBinary, get the flattened size and copy the values in.
        if isinstance(sp, gym.spaces.MultiBinary):
            size = reduce(lambda x, y: x * y, sp.shape)
            array[offset:offset + size] = v
        # If Discrete, use a one-hot representation: set the v-th entry to 1.
        elif isinstance(sp, gym.spaces.Discrete):
            size = sp.n
            array[offset + v] = 1
        # If Box, get the flattened size and assign the flattened values.
        elif isinstance(sp, gym.spaces.Box):
            size = reduce(lambda x, y: x * y, sp.shape)
            array[offset:offset + size] = v.flatten()
        # Else raise an exception.
        else:
            raise UnsupportedSpaceException(
                f"space {type(sp)} is not supported for ParametricWrapper")
        # Advance the offset by the size of this sub-space.
        offset += size
    return np.asarray(array)
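# A hedged usage sketch; the spaces and observation values are illustrative:
from collections import OrderedDict
from functools import reduce

import gym
import numpy as np

demo_spaces = OrderedDict({
    "mode": gym.spaces.Discrete(3),
    "pos": gym.spaces.Box(low=-1.0, high=1.0, shape=(2, )),
})
demo_obs = {"mode": 1, "pos": np.array([0.5, -0.5])}

flat = _make_array_from_obs(demo_obs, size=5, spaces=demo_spaces)
print(flat)  # -> one-hot "mode" (0, 1, 0) followed by flattened "pos" (0.5, -0.5)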
def _make_box_from_obs(space):
    """Convert a spaces.Dict to a spaces.Box via highs/lows vector initialization.

    :param space: gym.spaces.Dict
    :return: gym.spaces.Box
    """
    sp = list(space.spaces.values())
    lows = []
    highs = []
    # For every sub-space...
    for s in sp:
        # If Discrete, the observation will be transformed to a one-hot
        # representation to deal with discrete values, so add n 0/1 entries
        # to lows/highs.
        if isinstance(s, gym.spaces.Discrete):
            highs += [1] * s.n
            lows += [0] * s.n
        # If MultiBinary, do the same as before, but get the size via reduce.
        elif isinstance(s, gym.spaces.MultiBinary):
            sh = reduce(lambda x, y: x * y, s.shape)
            highs += [1] * sh
            lows += [0] * sh
        # If Box, just flatten its highs and lows.
        elif isinstance(s, gym.spaces.Box):
            highs += s.high.flatten().tolist()
            lows += s.low.flatten().tolist()
        # Else raise an exception.
        else:
            raise UnsupportedSpaceException(
                "Space {} is not supported.".format(space))
    # Convert to arrays.
    highs = np.asarray(highs)
    lows = np.asarray(lows)
    # Return a Box built from the high/low vectors.
    return gym.spaces.Box(high=highs, low=lows)
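# A matching sketch for the Box layout (same illustrative spaces as in the
# _make_array_from_obs demo above; note the Discrete sub-space becomes a
# one-hot block here, unlike the scalar used by _make_box_from_dict):
from collections import OrderedDict
from functools import reduce

import gym
import numpy as np

demo_spaces = OrderedDict({
    "mode": gym.spaces.Discrete(3),
    "pos": gym.spaces.Box(low=-1.0, high=1.0, shape=(2, )),
})
demo_box = _make_box_from_obs(gym.spaces.Dict(demo_spaces))
print(demo_box.shape)  # -> (5,): 3 one-hot dims for Discrete(3) + 2 Box dims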
def __init__(self, registry, env_creator, config, logdir):
    env = env_creator(config["env_config"])
    env = wrap_dqn(registry, env, config["model"])
    self.env = env
    self.config = config

    if not isinstance(env.action_space, Discrete):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DQN.".format(
                env.action_space))

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.dqn_graph = models.DQNGraph(registry, env, config, logdir)

    # Create the schedule for exploration starting from 1.
    self.exploration = LinearSchedule(
        schedule_timesteps=int(config["exploration_fraction"] *
                               config["schedule_max_timesteps"]),
        initial_p=1.0,
        final_p=config["exploration_final_eps"])

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.dqn_graph.update_target(self.sess)
    self.global_timestep = 0
    self.local_timestep = 0

    # Note that this encompasses both the Q and target network.
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.dqn_graph.q_t, self.dqn_graph.q_tp1), self.sess)

    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()
def __init__(self, registry, env_creator, config, worker_index):
    env = ModelCatalog.get_preprocessor_as_wrapper(
        registry, env_creator(config["env_config"]), config["model"])
    self.env = env
    self.config = config

    # DDPG requires a continuous (Box) action space.
    if not isinstance(env.action_space, Box):
        raise UnsupportedSpaceException(
            "Action space {} is not supported for DDPG.".format(
                env.action_space))

    tf_config = tf.ConfigProto(**config["tf_session_args"])
    self.sess = tf.Session(config=tf_config)
    self.ddpg_graph = models.DDPGGraph(registry, env, config)

    # Initialize the parameters and copy them to the target network.
    self.sess.run(tf.global_variables_initializer())
    self.ddpg_graph.copy_target(self.sess)
    self.global_timestep = 0
    self.local_timestep = 0

    nb_actions = env.action_space.shape[-1]
    stddev = config["exploration_noise"]
    self.exploration_noise = OUNoise(
        mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions))
    self.action_range = (-1., 1.)

    # Note that this encompasses both the Q and target network.
    self.variables = ray.experimental.TensorFlowVariables(
        tf.group(self.ddpg_graph.td_error, self.ddpg_graph.action_loss),
        self.sess)

    self.max_action = env.action_space.high
    self.episode_rewards = [0.0]
    self.episode_lengths = [0.0]
    self.saved_mean_reward = None
    self.obs = self.env.reset()