def rollout(env, policy, path_length, render=False, speedup=None):
    Da = flat_dim(env.action_space)
    Do = flat_dim(env.observation_space)

    observation = env.reset()
    policy.reset()

    observations = np.zeros((path_length + 1, Do))
    actions = np.zeros((path_length, Da))
    terminals = np.zeros((path_length, ))
    rewards = np.zeros((path_length, ))
    agent_infos = []
    env_infos = []

    t = 0
    for t in range(path_length):
        action, agent_info = policy.get_action(observation)
        next_obs, reward, terminal, env_info = env.step(action)

        agent_infos.append(agent_info)
        env_infos.append(env_info)

        actions[t] = action
        terminals[t] = terminal
        rewards[t] = reward
        observations[t] = observation

        observation = next_obs

        if render:
            env.render()
            time_step = 0.05
            time.sleep(time_step / speedup)

        if terminal:
            break

    observations[t + 1] = observation

    path = {
        'observations': observations[:t + 1],
        'actions': actions[:t + 1],
        'rewards': rewards[:t + 1],
        'terminals': terminals[:t + 1],
        'next_observations': observations[1:t + 2],
        'agent_infos': agent_infos,
        'env_infos': env_infos
    }

    return path
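# Hedged usage sketch for `rollout` above: `RandomPolicy` is a hypothetical
# stand-in, not part of the original code; any object exposing `reset()` and
# `get_action(obs)` returning an (action, agent_info) pair works the same way.
# It also assumes `flat_dim` accepts the environment's gym-style spaces.
import gym


class RandomPolicy:
    def __init__(self, action_space):
        self._action_space = action_space

    def reset(self):
        pass

    def get_action(self, observation):
        # Sample a random action; the empty dict plays the role of agent_info.
        return self._action_space.sample(), {}


env = gym.make('Pendulum-v0')
path = rollout(env, RandomPolicy(env.action_space), path_length=100)
print(path['observations'].shape, path['rewards'].shape)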
def __init__(self, env_spec, q_functions):
    Serializable.quick_init(self, locals())

    self.q_functions = q_functions

    self._Da = flat_dim(env_spec.action_space)
    self._Do = flat_dim(env_spec.observation_space)

    self._observations_ph = tf.placeholder(
        tf.float32, shape=[None, self._Do], name='observations')
    self._actions_ph = tf.placeholder(
        tf.float32, shape=[None, self._Da], name='actions')

    self._output = self.output_for(
        self._observations_ph, self._actions_ph, reuse=True)
def __init__(
        self,
        env,
        scale_reward=1.,
        normalize_obs=False,
        normalize_reward=False,
        flatten_obs=True,
        obs_alpha=0.001,
        reward_alpha=0.001,
):
    Serializable.quick_init(self, locals())
    super(NormalizedEnv, self).__init__(env)
    self._scale_reward = scale_reward
    self._normalize_obs = normalize_obs
    self._normalize_reward = normalize_reward
    self._flatten_obs = flatten_obs
    self._obs_alpha = obs_alpha
    flat_obs_dim = flat_dim(env.observation_space)
    self._obs_mean = np.zeros(flat_obs_dim)
    self._obs_var = np.ones(flat_obs_dim)
    self._reward_alpha = reward_alpha
    self._reward_mean = 0.
    self._reward_var = 1.
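# Hedged sketch of what `obs_alpha` above typically drives: an exponential
# moving average of the observation mean and variance. This is a guess at the
# shape of the update, not necessarily this class's exact method body;
# `flatten` is assumed to come from the same module as `flat_dim`.
def _update_obs_estimate(self, obs):
    flat_obs = flatten(self.env.observation_space, obs)
    self._obs_mean = (1 - self._obs_alpha) * self._obs_mean \
        + self._obs_alpha * flat_obs
    self._obs_var = (1 - self._obs_alpha) * self._obs_var \
        + self._obs_alpha * np.square(flat_obs - self._obs_mean)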
def clear_patch(hfield, box):
    '''
    Clears a patch shaped like box, assuming the robot is placed in the
    center of hfield.

    @param box: garage.spaces.Box-like
    '''
    if flat_dim(box) > 2:
        raise ValueError("Provide 2dim box")

    # clear patch
    h_center = int(0.5 * hfield.shape[0])
    w_center = int(0.5 * hfield.shape[1])
    fromrow, torow = w_center + int(box.low[0] / STEP), w_center + int(
        box.high[0] / STEP)
    fromcol, tocol = h_center + int(box.low[1] / STEP), h_center + int(
        box.high[1] / STEP)
    hfield[fromrow:torow, fromcol:tocol] = 0.0

    # convolve to smoothen edges somewhat, in case hills were cut off
    K = np.ones((10, 10)) / 100.0
    s = convolve2d(
        hfield[fromrow - 9:torow + 9, fromcol - 9:tocol + 9],
        K,
        mode='same',
        boundary='symm')
    hfield[fromrow - 9:torow + 9, fromcol - 9:tocol + 9] = s
    return hfield
def __init__(self,
             env_spec,
             hidden_sizes=(64, 64),
             name="ContinuousMLPPolicy",
             hidden_nonlinearity=tf.nn.relu,
             output_nonlinearity=tf.nn.tanh,
             input_include_goal=False,
             bn=False):
    """
    Initialize class with multiple attributes.

    Args:
        env_spec(): Environment specification.
        hidden_sizes(list or tuple, optional): A list of numbers of hidden
            units for all hidden layers.
        name(str, optional): A str containing the name of the policy.
        hidden_nonlinearity(optional): An activation shared by all fc layers.
        output_nonlinearity(optional): An activation used by the output
            layer.
        input_include_goal(bool, optional): Whether the observation includes
            a desired goal.
        bn(bool, optional): Whether to apply batch normalization to the
            layers.
    """
    assert isinstance(env_spec.action_space, Box)

    Serializable.quick_init(self, locals())
    super(ContinuousMLPPolicy, self).__init__(env_spec)

    self.name = name
    self._env_spec = env_spec
    if input_include_goal:
        obs_dim = flat_dim(
            env_spec.observation_space.spaces["observation"])
        goal_dim = flat_dim(
            env_spec.observation_space.spaces["desired_goal"])
        self._obs_dim = obs_dim + goal_dim
    else:
        self._obs_dim = env_spec.observation_space.flat_dim
    self._action_dim = env_spec.action_space.flat_dim
    self._action_bound = env_spec.action_space.high
    self._hidden_sizes = hidden_sizes
    self._hidden_nonlinearity = hidden_nonlinearity
    self._output_nonlinearity = output_nonlinearity
    self._batch_norm = bn
    self._policy_network_name = "policy_network"
def __init__(self,
             env_spec,
             hidden_layer_sizes=(100, 100),
             name='q_function'):
    Serializable.quick_init(self, locals())

    self._Da = flat_dim(env_spec.action_space)
    self._Do = flat_dim(env_spec.observation_space)

    self._observations_ph = tf.placeholder(
        tf.float32, shape=[None, self._Do], name='observations')
    self._actions_ph = tf.placeholder(
        tf.float32, shape=[None, self._Da], name='actions')

    super(NNQFunction, self).__init__(
        inputs=(self._observations_ph, self._actions_ph),
        name=name,
        hidden_layer_sizes=hidden_layer_sizes)
def __init__(
        self,
        env,
        obs_noise=1e-1,
):
    Serializable.quick_init(self, locals())
    super(NoisyObservationEnv, self).__init__(env)
    self.obs_noise = obs_noise
    self._action_flat_dim = flat_dim(self.action_space)
def __init__(self, env_spec, name="ContinuousMLPQFunction", hidden_sizes=(32, 32), hidden_nonlinearity=tf.nn.relu, action_merge_layer=-2, output_nonlinearity=None, input_include_goal=False, bn=False): """ Initialize class with multiple attributes. Args: env_spec(): name(str, optional): A str contains the name of the policy. hidden_sizes(list or tuple, optional): A list of numbers of hidden units for all hidden layers. hidden_nonlinearity(optional): An activation shared by all fc layers. action_merge_layer(int, optional): An index to indicate when to merge action layer. output_nonlinearity(optional): An activation used by the output layer. bn(bool, optional): A bool to indicate whether normalize the layer or not. """ Serializable.quick_init(self, locals()) self.name = name self._env_spec = env_spec if input_include_goal: obs_dim = flat_dim( env_spec.observation_space.spaces["observation"]) goal_dim = flat_dim( env_spec.observation_space.spaces["desired_goal"]) self._obs_dim = obs_dim + goal_dim else: self._obs_dim = env_spec.observation_space.flat_dim self._action_dim = env_spec.action_space.flat_dim self._hidden_sizes = hidden_sizes self._hidden_nonlinearity = hidden_nonlinearity self._action_merge_layer = action_merge_layer self._output_nonlinearity = output_nonlinearity self._batch_norm = bn
def __init__(
        self,
        env,
        action_delay=3,
):
    assert action_delay > 0, "Should not use this env transformer"
    Serializable.quick_init(self, locals())
    super(DelayedActionEnv, self).__init__(env)
    self.action_delay = action_delay
    self._action_flat_dim = flat_dim(self.action_space)
    self._queued_actions = None
def __init__(
        self,
        env,
        obs_noise=1e-1,
):
    super().__init__(env)
    self.obs_noise = obs_noise
    self._action_flat_dim = flat_dim(self.action_space)
    # Always call Serializable constructor last
    Serializable.quick_init(self, locals())
def __init__(self,
             env_spec,
             hidden_layer_sizes,
             squash=True,
             name='policy'):
    Serializable.quick_init(self, locals())

    self._action_dim = flat_dim(env_spec.action_space)
    self._observation_dim = flat_dim(env_spec.observation_space)
    self._layer_sizes = list(hidden_layer_sizes) + [self._action_dim]
    self._squash = squash
    self._name = name

    self._observation_ph = tf.placeholder(
        tf.float32,
        shape=[None, self._observation_dim],
        name='observation')

    self._actions = self.actions_for(self._observation_ph)

    super(StochasticNNPolicy, self).__init__(
        env_spec, self._observation_ph, self._actions, self._name)
def __init__(self, env_spec, max_replay_buffer_size):
    super(SimpleReplayBuffer, self).__init__()
    Serializable.quick_init(self, locals())

    max_replay_buffer_size = int(max_replay_buffer_size)

    self._env_spec = env_spec
    self._observation_dim = flat_dim(env_spec.observation_space)
    self._action_dim = flat_dim(env_spec.action_space)
    self._max_buffer_size = max_replay_buffer_size
    self._observations = np.zeros(
        (max_replay_buffer_size, self._observation_dim))
    # It's a bit memory inefficient to save the observations twice,
    # but it makes the code *much* easier since you no longer have to
    # worry about termination conditions.
    self._next_obs = np.zeros(
        (max_replay_buffer_size, self._observation_dim))
    self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
    self._rewards = np.zeros(max_replay_buffer_size)
    # self._terminals[i] = a terminal was received at time i
    self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
    self._top = 0
    self._size = 0
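# Hedged sketch of the ring-buffer bookkeeping that `_top` and `_size` above
# usually support; the original class's `add_sample` may differ in details.
def add_sample(self, observation, action, reward, terminal, next_observation):
    self._observations[self._top] = observation
    self._actions[self._top] = action
    self._rewards[self._top] = reward
    self._terminals[self._top] = terminal
    self._next_obs[self._top] = next_observation
    # Advance the write pointer, wrapping once the buffer is full.
    self._top = (self._top + 1) % self._max_buffer_size
    if self._size < self._max_buffer_size:
        self._size += 1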
def test_unflatten():
    env = normalize(
        gym.make('Blackjack-v0'),
        normalize_reward=True,
        normalize_obs=True,
        flatten_obs=False)
    for i in range(10):
        env.reset()
        for e in range(100):
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            # flat_dim returns an int, so compare against a 1-element shape.
            assert flatten(env.observation_space, next_obs).shape == \
                (flat_dim(env.observation_space), )
            if done:
                break
    env.close()
def test_flatten():
    env = normalize(
        gym.make('Pendulum-v0'),
        normalize_reward=True,
        normalize_obs=True,
        flatten_obs=True)
    for i in range(10):
        env.reset()
        for e in range(100):
            env.render()
            action = env.action_space.sample()
            next_obs, reward, done, info = env.step(action)
            # flat_dim returns an int, so compare against a 1-element shape.
            assert next_obs.shape == (flat_dim(env.observation_space), )
            if done:
                break
    env.close()
def _set_sensor_mask(self, env, sensor_idx):
    obsdim = flat_dim(env.observation_space)
    if len(sensor_idx) > obsdim:
        raise ValueError(
            ("Length of sensor mask ({0}) cannot be greater "
             "than observation dim ({1})").format(len(sensor_idx), obsdim))
    if len(sensor_idx) == obsdim and not np.any(np.array(sensor_idx) > 1):
        sensor_mask = np.array(sensor_idx, dtype=np.bool)
    elif np.any(np.unique(sensor_idx, return_counts=True)[1] > 1):
        raise ValueError(
            ("Double entries or boolean mask "
             "with dim ({0}) < observation dim ({1})").format(
                 len(sensor_idx), obsdim))
    else:
        sensor_mask = np.zeros((obsdim, ), dtype=np.bool)
        sensor_mask[sensor_idx] = 1
    self._sensor_mask = sensor_mask
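# Hedged illustration of the two mask forms `_set_sensor_mask` accepts,
# assuming a 4-dimensional observation space: either a full boolean mask
# (e.g. [1, 0, 1, 0]) used as-is, or a list of sensor indices (e.g. [0, 2])
# scattered into a boolean mask of length obsdim, as the else-branch does.
import numpy as np

obsdim = 4
sensor_idx = [0, 2]
sensor_mask = np.zeros((obsdim, ), dtype=bool)
sensor_mask[sensor_idx] = True
assert sensor_mask.tolist() == [True, False, True, False]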
def __init__(
        self,
        base_kwargs,
        env,
        pool,
        qf,
        policy,
        plotter=None,
        policy_lr=1E-3,
        qf_lr=1E-3,
        value_n_particles=16,
        td_target_update_interval=1,
        kernel_fn=adaptive_isotropic_gaussian_kernel,
        kernel_n_particles=16,
        kernel_update_ratio=0.5,
        discount=0.99,
        reward_scale=1,
        use_saved_qf=False,
        use_saved_policy=False,
        save_full_state=False,
        train_qf=True,
        train_policy=True,
):
    """
    Args:
        base_kwargs (dict): Dictionary of base arguments that are directly
            passed to the base `RLAlgorithm` constructor.
        env (`rllab.Env`): rllab environment object.
        pool (`PoolBase`): Replay buffer to add gathered samples to.
        qf (`NNQFunction`): Q-function approximator.
        policy (`rllab.NNPolicy`): A policy function approximator.
        plotter (`QFPolicyPlotter`): Plotter instance to be used for
            visualizing Q-function during training.
        policy_lr (`float`): Learning rate used for the policy approximator.
        qf_lr (`float`): Learning rate used for the Q-function approximator.
        value_n_particles (`int`): The number of action samples used for
            estimating the value of next state.
        td_target_update_interval (`int`): How often the target network is
            updated to match the current Q-function.
        kernel_fn (function object): A function object that represents a
            kernel function.
        kernel_n_particles (`int`): Total number of particles per state used
            in SVGD updates.
        kernel_update_ratio (`float`): The ratio of SVGD particles used for
            the computation of the inner/outer empirical expectation.
        discount (`float`): Discount factor.
        reward_scale (`float`): A factor that scales the raw rewards. Useful
            for adjusting the temperature of the optimal Boltzmann
            distribution.
        use_saved_qf (`boolean`): If true, use the initial parameters
            provided in the Q-function instead of reinitializing.
        use_saved_policy (`boolean`): If true, use the initial parameters
            provided in the policy instead of reinitializing.
        save_full_state (`boolean`): If true, saves the full algorithm state,
            including the replay buffer.
        train_qf (`boolean`): If true, train the Q-function.
        train_policy (`boolean`): If true, train the policy.
    """
    super(SQL, self).__init__(**base_kwargs)

    self.env = env
    self.pool = pool
    self.qf = qf
    self.policy = policy
    self.plotter = plotter

    self._qf_lr = qf_lr
    self._policy_lr = policy_lr
    self._discount = discount
    self._reward_scale = reward_scale

    self._value_n_particles = value_n_particles
    self._qf_target_update_interval = td_target_update_interval

    self._kernel_fn = kernel_fn
    self._kernel_n_particles = kernel_n_particles
    self._kernel_update_ratio = kernel_update_ratio

    self._save_full_state = save_full_state
    self._train_qf = train_qf
    self._train_policy = train_policy

    self._observation_dim = flat_dim(self.env.observation_space)
    self._action_dim = flat_dim(self.env.action_space)

    self._create_placeholders()

    self._training_ops = []
    self._target_ops = []

    self._create_td_update()
    self._create_svgd_update()
    self._create_target_ops()

    if use_saved_qf:
        saved_qf_params = qf.get_param_values()
    if use_saved_policy:
        saved_policy_params = policy.get_param_values()

    self._sess = tf_utils.get_default_session()
    self._sess.run(tf.global_variables_initializer())

    if use_saved_qf:
        self.qf.set_param_values(saved_qf_params)
    if use_saved_policy:
        self.policy.set_param_values(saved_policy_params)
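# Hedged illustration of how `kernel_update_ratio` is commonly interpreted in
# SVGD-based soft Q-learning: it splits `kernel_n_particles` sampled actions
# into a set whose gradients are updated and a fixed set used for the kernel
# expectation. The variable names here are assumptions, not this class's own.
kernel_n_particles = 16
kernel_update_ratio = 0.5
n_updated_actions = int(kernel_n_particles * kernel_update_ratio)  # 8
n_fixed_actions = kernel_n_particles - n_updated_actions           # 8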
def _build_net(self, reuse=None, custom_getter=None, trainable=None):
    """
    Set up the Q network based on class attributes.

    This function uses layers defined in rllab.tf.

    Args:
        reuse: A bool indicating whether to reuse variables in the same
            scope.
        custom_getter: A customized getter object used to get variables.
        trainable: A bool indicating whether the variables are trainable.
    """
    with tf.variable_scope(
            self.name, reuse=reuse, custom_getter=custom_getter):
        l_obs = L.InputLayer(
            shape=(None, flat_dim(self._env_spec.observation_space)),
            name="obs")
        l_action = L.InputLayer(
            shape=(None, flat_dim(self._env_spec.action_space)),
            name="actions")

        n_layers = len(self._hidden_sizes) + 1

        if n_layers > 1:
            action_merge_layer = \
                (self._action_merge_layer % n_layers + n_layers) % n_layers
        else:
            action_merge_layer = 1

        l_hidden = l_obs

        for idx, size in enumerate(self._hidden_sizes):
            if self._batch_norm:
                l_hidden = batch_norm(l_hidden)

            if idx == action_merge_layer:
                l_hidden = L.ConcatLayer([l_hidden, l_action])

            l_hidden = L.DenseLayer(
                l_hidden,
                num_units=size,
                nonlinearity=self._hidden_nonlinearity,
                trainable=trainable,
                name="hidden_%d" % (idx + 1))

        if action_merge_layer == n_layers:
            l_hidden = L.ConcatLayer([l_hidden, l_action])

        l_output = L.DenseLayer(
            l_hidden,
            num_units=1,
            nonlinearity=self._output_nonlinearity,
            trainable=trainable,
            name="output")

        output_var = L.get_output(l_output)

    self._f_qval = tensor_utils.compile_function(
        [l_obs.input_var, l_action.input_var], output_var)
    self._output_layer = l_output
    self._obs_layer = l_obs
    self._action_layer = l_action

    LayersPowered.__init__(self, [l_output])
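# Worked example of the merge-index arithmetic in `_build_net` above: with
# hidden_sizes=(32, 32), n_layers is 3, and the default action_merge_layer=-2
# wraps around to index 1, so the action input is concatenated in just before
# the second hidden layer.
n_layers = len((32, 32)) + 1                                  # 3
action_merge_layer = (-2 % n_layers + n_layers) % n_layers    # 1
assert action_merge_layer == 1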
def __init__(self,
             env,
             actor,
             critic,
             n_epochs=500,
             n_epoch_cycles=20,
             n_rollout_steps=100,
             n_train_steps=50,
             reward_scale=1.,
             batch_size=64,
             target_update_tau=0.01,
             discount=0.99,
             actor_lr=1e-4,
             critic_lr=1e-3,
             actor_weight_decay=0,
             critic_weight_decay=0,
             replay_buffer_size=int(1e6),
             min_buffer_size=10000,
             exploration_strategy=None,
             plot=False,
             pause_for_plot=False,
             actor_optimizer=None,
             critic_optimizer=None,
             name=None):
    """
    Construct class.

    Args:
        env(): Environment.
        actor(garage.tf.policies.ContinuousMLPPolicy): Policy network.
        critic(garage.tf.q_functions.ContinuousMLPQFunction): Q-value
            network.
        n_epochs(int, optional): Number of epochs.
        n_epoch_cycles(int, optional): Number of epoch cycles.
        n_rollout_steps(int, optional): Number of rollout steps.
        n_train_steps(int, optional): Number of train steps.
        reward_scale(float): The scaling factor applied to the rewards when
            training.
        batch_size(int): Number of samples for each minibatch.
        target_update_tau(float): Interpolation parameter for doing the soft
            target update.
        discount(float): Discount factor for the cumulative return.
        actor_lr(float): Learning rate for training the policy network.
        critic_lr(float): Learning rate for training the Q-value network.
        actor_weight_decay(float): L2 weight decay factor for parameters of
            the policy network.
        critic_weight_decay(float): L2 weight decay factor for parameters of
            the Q-value network.
        replay_buffer_size(int): Size of the replay buffer.
        min_buffer_size(int): Minimum size of the replay buffer to start
            training.
        exploration_strategy(): Exploration strategy.
        plot(bool): Whether to visualize the policy performance after each
            eval_interval.
        pause_for_plot(bool): Whether to pause before continuing when
            plotting.
        actor_optimizer(): Optimizer for training the policy network.
        critic_optimizer(): Optimizer for training the Q-function network.
        name(str, optional): Name of the algorithm instance.
    """
    self.env = env

    self.observation_dim = flat_dim(env.observation_space)
    self.action_dim = flat_dim(env.action_space)
    _, self.action_bound = bounds(env.action_space)

    self.actor = actor
    self.critic = critic
    self.n_epochs = n_epochs
    self.n_epoch_cycles = n_epoch_cycles
    self.n_rollout_steps = n_rollout_steps
    self.n_train_steps = n_train_steps
    self.reward_scale = reward_scale
    self.batch_size = batch_size
    self.tau = target_update_tau
    self.discount = discount
    self.actor_lr = actor_lr
    self.critic_lr = critic_lr
    self.actor_weight_decay = actor_weight_decay
    self.critic_weight_decay = critic_weight_decay
    self.replay_buffer_size = replay_buffer_size
    self.min_buffer_size = min_buffer_size
    self.es = exploration_strategy
    self.plot = plot
    self.pause_for_plot = pause_for_plot
    self.actor_optimizer = actor_optimizer
    self.critic_optimizer = critic_optimizer
    self.name = name

    self._initialize()
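# Hedged sketch of the soft target update that `target_update_tau` above
# parameterizes; `_initialize()` is not shown here, so this is the
# conventional DDPG rule rather than this class's exact implementation.
import numpy as np

tau = 0.01
online_param = np.array([1.0, 2.0])
target_param = np.zeros(2)
# target <- tau * online + (1 - tau) * target
target_param = tau * online_param + (1 - tau) * target_param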