def __init__(
        self,
        input_ph: tf.Tensor,
        name_scope: str,
        net_name: str,
        reuse,
        mlp_config: list,
        input_norm: np.ndarray = None,
        output_norm: np.ndarray = None,
        output_low: np.ndarray = None,
        output_high: np.ndarray = None,
):
    """Build an MLP on top of ``input_ph`` and record its tf variables.

    :param input_ph: tensor/placeholder used as the network input.
    :param name_scope: tf variable scope the network is built under; every
        created variable is asserted to live inside this scope.
    :param net_name: name for the network and its parameter record.
    :param reuse: tf reuse flag forwarded to the network creator.
    :param mlp_config: list of per-layer config dicts (consumed by MLPCreator;
        exact schema defined there).
    :param input_norm: optional input normalization stats.
        # NOTE(review): presumably (mean, std) arrays -- confirm against MLPCreator
    :param output_norm: optional output normalization stats.
    :param output_low: optional lower bound applied to the output.
    :param output_high: optional upper bound applied to the output.
    """
    self.input_ph = input_ph
    self.name_scope = name_scope
    self.mlp_config = mlp_config
    self.mlp_net_name = net_name
    # Delegate the actual graph construction; returns the layer container,
    # the output tensor and the list of variables that were created.
    self.net, self.output, self.var_list = MLPCreator.create_network_with_tf_layers(
        input=input_ph,
        reuse=reuse,
        network_config=mlp_config,
        tf_var_scope=name_scope,
        net_name=net_name,
        input_norm=input_norm,
        output_high=output_high,
        output_low=output_low,
        output_norm=output_norm)
    # Sanity check: every variable must have been created under name_scope.
    for var in self.var_list:
        assert name_scope in var.name
    self._parameters = ParametersWithTensorflowVariable(
        tf_var_list=self.var_list,
        name='parameters_{}'.format(self.mlp_net_name),
        rest_parameters=dict())
class MLP(object):
    """Thin wrapper around a tf-layers MLP.

    Builds the network via MLPCreator, tracks its variables in a
    ParametersWithTensorflowVariable record, and offers forward / copy /
    init helpers.
    """

    @typechecked
    def __init__(
            self,
            input_ph: tf.Tensor,
            name_scope: str,
            net_name: str,
            reuse,
            mlp_config: list,
            input_norm: np.ndarray = None,
            output_norm: np.ndarray = None,
            output_low: np.ndarray = None,
            output_high: np.ndarray = None,
    ):
        """
        :param input_ph: tensor/placeholder used as the network input.
        :param name_scope: tf variable scope; all created variables are
            asserted to live inside it.
        :param net_name: name for the network and its parameter record.
        :param reuse: tf reuse flag forwarded to the network creator.
        :param mlp_config: list of per-layer config dicts (see MLPCreator).
        :param input_norm: optional input normalization stats.
        :param output_norm: optional output normalization stats.
        :param output_low: optional lower bound applied to the output.
        :param output_high: optional upper bound applied to the output.
        """
        self.input_ph = input_ph
        self.name_scope = name_scope
        self.mlp_config = mlp_config
        self.mlp_net_name = net_name
        # Delegate graph construction; returns the layer container, the
        # output tensor and the list of created variables.
        self.net, self.output, self.var_list = MLPCreator.create_network_with_tf_layers(
            input=input_ph,
            reuse=reuse,
            network_config=mlp_config,
            tf_var_scope=name_scope,
            net_name=net_name,
            input_norm=input_norm,
            output_high=output_high,
            output_low=output_low,
            output_norm=output_norm)
        # Sanity check: every variable must live under name_scope.
        for var in self.var_list:
            assert name_scope in var.name
        self._parameters = ParametersWithTensorflowVariable(
            tf_var_list=self.var_list,
            name='parameters_{}'.format(self.mlp_net_name),
            rest_parameters=dict())

    def forward(self, input: np.ndarray, sess=None) -> np.ndarray:
        """Run one forward pass and return the squeezed result.

        BUG FIX: the original signature was ``sess=tf.get_default_session()``.
        A Python default argument is evaluated once, at class-definition time,
        so it captured whatever session (usually None) existed at import and
        never the session active at call time. Resolve the default lazily.

        :param input: batch of inputs fed to ``input_ph``.
        :param sess: tf session to run in; defaults to the *current* default
            session at call time.
        """
        if sess is None:
            sess = tf.get_default_session()
        feed_dict = {
            self.input_ph: input,
            **self._parameters.return_tf_parameter_feed_dict()
        }
        res = sess.run(self.output, feed_dict=feed_dict)
        return np.squeeze(res)

    def copy_from(self, obj) -> bool:
        """Copy parameter values from another MLP of the same type.

        :raises TypeError: if ``obj`` is not the same type as ``self``.
        """
        if not isinstance(obj, type(self)):
            raise TypeError(
                'Wrong type of obj %s to be copied, which should be %s' %
                (type(obj), type(self)))
        self._parameters.copy_from(source_parameter=obj._parameters)
        return True

    def init(self, source_obj=None):
        """Initialize the tf variables, optionally copying from ``source_obj``."""
        self._parameters.init()
        if source_obj:
            self.copy_from(obj=source_obj)
def __init__(self, env_spec,
             config_or_config_dict: (DictConfig, dict),
             value_func: MLPQValueFunction,
             schedule_param_list=None,
             name: str = 'dqn',
             replay_buffer=None):
    """DQN set-up: target Q network, Bellman-target tensor and training ops.

    :param env_spec: environment specification (action/observation spaces).
    :param config_or_config_dict: DictConfig or plain dict of hyper-parameters
        (REPLAY_BUFFER_SIZE, GAMMA, optionally UPDATE_TARGET_Q_FREQUENCY, ...).
    :param value_func: Q network to train.
    :param schedule_param_list: optional scheduled-parameter definitions.
    :param name: tf scope / instance name.
    :param replay_buffer: optional replay buffer instance; a uniform random
        buffer is created when omitted.
    """
    ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
    self.config = construct_dict_config(config_or_config_dict, self)
    if replay_buffer:
        # BUG FIX: the original asserted issubclass(replay_buffer, ...), but a
        # buffer *instance* is stored and later sampled from, and issubclass()
        # raises TypeError when handed an instance. isinstance is the intended
        # check.
        assert isinstance(replay_buffer, BaseReplayBuffer)
        self.replay_buffer = replay_buffer
    else:
        self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'),
                                                       action_shape=self.env_spec.action_shape,
                                                       observation_shape=self.env_spec.obs_shape)
    self.q_value_func = value_func
    self.state_input = self.q_value_func.state_input
    self.action_input = self.q_value_func.action_input
    # Target-network sync frequency; defaults to 1 (sync every train call)
    # when the config does not define it.
    self.update_target_q_every_train = self.config('UPDATE_TARGET_Q_FREQUENCY') \
        if 'UPDATE_TARGET_Q_FREQUENCY' in self.config.config_dict else 1
    self.parameters = ParametersWithTensorflowVariable(tf_var_list=[],
                                                       rest_parameters=dict(),
                                                       to_scheduler_param_tuple=schedule_param_list,
                                                       name='{}_param'.format(name),
                                                       source_config=self.config,
                                                       require_snapshot=False)
    with tf.variable_scope(name):
        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        done = tf.cast(self.done_input, dtype=tf.float32)
        self.target_q_value_func = self.q_value_func.make_copy(name_scope='{}_target_q_value_net'.format(name),
                                                               name='{}_target_q_value_net'.format(name),
                                                               reuse=False)
        # Bellman target r + gamma * target_q (target_q fed externally),
        # masked to r on terminal transitions.
        self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input
        self.td_error = self.predict_q_value - self.q_value_func.q_tensor
        with tf.variable_scope('train'):
            self.q_value_func_loss, self.optimizer, self.update_q_value_func_op = self._set_up_loss()
            self.update_target_q_value_func_op = self._set_up_target_update()
    # Collect train-scope variables plus optimizer slot variables; sorting by
    # name keeps the registered variable list deterministic.
    # (The original noted this sort as "redundant" -- it is cheap and kept for
    # deterministic ordering across runs.)
    var_list = get_tf_collection_var_list(key=tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='{}/train'.format(name)) + self.optimizer.variables()
    self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(self,
                                   sub_placeholder_input_list=[dict(obj=self.q_value_func,
                                                                    attr_name='q_value_func'),
                                                               dict(obj=self.target_q_value_func,
                                                                    attr_name='target_q_value_func')],
                                   parameters=self.parameters)
def __init__(self, env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             reuse=False):
    """Deterministic policy backed by an MLP mapping flat obs -> flat action.

    :param env_spec: environment specification (flat obs/action dims).
    :param name_scope: tf variable scope for the state placeholder / network.
    :param name: policy instance name.
    :param mlp_config: per-layer config list; the last layer's N_UNITS must
        equal the flat action dimension.
    :param input_norm: optional input normalization stats.
    :param output_norm: optional output normalization stats.
    :param output_low: optional lower bound applied to the output.
    :param output_high: optional upper bound applied to the output.
    :param reuse: tf reuse flag for the underlying network.
    """
    DeterministicPolicy.__init__(self, env_spec=env_spec, name=name, parameters=None)
    obs_dim = env_spec.flat_obs_dim
    action_dim = env_spec.flat_action_dim
    # BUG FIX: removed leftover debug print() calls and added a message to the
    # assertion so a mismatch is diagnosable without them.
    assert action_dim == mlp_config[-1]['N_UNITS'], \
        'last layer N_UNITS {} != flat action dim {}'.format(mlp_config[-1]['N_UNITS'], action_dim)
    with tf.variable_scope(name_scope):
        state_input = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32, name='state_ph')
    mlp_kwargs = dict(reuse=reuse,
                      input_norm=input_norm,
                      output_norm=output_norm,
                      output_low=output_low,
                      output_high=output_high,
                      mlp_config=mlp_config,
                      name_scope=name_scope)
    mlp_net = MLP(input_ph=state_input, **mlp_kwargs, net_name='deterministic_mlp_policy')
    PlaceholderInput.__init__(self, parameters=None)
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=mlp_net.var_list,
        rest_parameters=mlp_kwargs,
        name='deterministic_mlp_policy_tf_param')
    self.state_input = state_input
    self.mlp_net = mlp_net
    self.action_tensor = mlp_net.output
    # BUG FIX: self.mlp_config was assigned twice in the original; once is enough.
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.name_scope = name_scope
def __init__(self,
             env_spec: EnvSpec,
             batch_data,
             init_state=None,
             name_scope='gp_dynamics_model',
             name='gp_dynamics_model',
             gp_kernel_type='RBF'):
    """Gaussian-process dynamics model fitted on (state, action) -> state delta.

    :param env_spec: environment specification (flat obs/action dims).
    :param batch_data: transition batch providing state_set / action_set /
        new_state_set arrays used as GP training data.
    :param init_state: optional initial state forwarded to the base class.
    :param name_scope: tf variable scope the MGPR model is built under.
    :param name: model instance name.
    :param gp_kernel_type: kernel name; must be a key of kernel_type_dict.
    :raises TypeError: if ``gp_kernel_type`` is not supported.
    """
    if gp_kernel_type not in self.kernel_type_dict:
        # BUG FIX: the original format string had only one '{}' placeholder,
        # so the list of supported kernels passed as the second argument was
        # silently dropped from the error message.
        raise TypeError('Not supported {} kernel, choose from {}'.format(
            gp_kernel_type, list(self.kernel_type_dict.keys())))
    parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(),
        name='{}_param'.format(name),
        require_snapshot=False)
    super().__init__(env_spec, parameters, init_state, name)
    self.name_scope = name_scope
    # GP inputs are concatenated [state, action]; targets are state deltas.
    state_action_data = np.hstack(
        (batch_data.state_set, batch_data.action_set))
    delta_state_data = batch_data.new_state_set - batch_data.state_set
    with tf.variable_scope(self.name_scope):
        self.mgpr_model = MGPR(name='mgpr',
                               action_dim=env_spec.flat_action_dim,
                               x=state_action_data,
                               y=delta_state_data,
                               state_dim=env_spec.flat_obs_dim)
    # Register everything the MGPR model created under this scope, sorted by
    # name for deterministic ordering.
    var_list = get_tf_collection_var_list(
        key=tf.GraphKeys.GLOBAL_VARIABLES, scope=self.name_scope)
    self.parameters.set_tf_var_list(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
def create_ph(self, name):
    """Build a PlaceholderInput fixture with one tf variable and a config.

    Returns a tuple of (PlaceholderInput, locals()); because ``locals()`` is
    part of the return value, the local variable names below are a de-facto
    contract for callers -- do not rename them casually.
    """
    with tf.variable_scope(name):
        a = tf.get_variable(shape=[3, 4], dtype=tf.float32, name='var_1')
    conf = DictConfig(required_key_dict=Foo.required_key_dict,
                      config_dict=dict(var1=1, var2=0.01))
    param = ParametersWithTensorflowVariable(
        tf_var_list=[a],
        rest_parameters=dict(var3='sss'),
        name=name,
        source_config=conf,
        require_snapshot=True,
        to_ph_parameter_dict=dict(
            var1=tf.placeholder(shape=(), dtype=tf.int32)))
    param.init()
    # ``a`` is deliberately rebound from the tf variable to the wrapper object;
    # the original variable stays reachable via param / locals().
    a = PlaceholderInput(parameters=param)
    return a, locals()
def __init__(self,
             env_spec: EnvSpec,
             name_scope: str,
             name: str,
             mlp_config: list,
             state_input: tf.Tensor = None,
             reuse=False,
             input_norm: np.ndarray = None,
             output_norm: np.ndarray = None,
             output_low: np.ndarray = None,
             output_high: np.ndarray = None,
             ):
    """MLP-based state-value function V(s).

    :param env_spec: environment specification (flat obs dim).
    :param name_scope: tf variable scope for the state placeholder / network.
    :param name: instance name.
    :param mlp_config: per-layer config list (see MLPCreator).
    :param state_input: optional externally-owned state tensor; a fresh
        placeholder is created under ``name_scope`` when omitted.
    :param reuse: tf reuse flag for the underlying network.
    :param input_norm: optional input normalization stats.
    :param output_norm: optional output normalization stats.
    :param output_low: optional lower bound applied to the output.
    :param output_high: optional upper bound applied to the output.
    """
    with tf.variable_scope(name_scope):
        # Reuse the caller's tensor when given so several modules can share
        # one state input; otherwise create our own placeholder.
        state_input = state_input if state_input is not None else tf.placeholder(
            shape=[None, env_spec.flat_obs_dim], dtype=tf.float32, name='state_ph')
    mlp_input_ph = state_input
    # These kwargs are also stored as rest_parameters, so the keys double as
    # the saved hyper-parameter record.
    mlp_kwargs = dict(
        reuse=reuse,
        mlp_config=mlp_config,
        input_norm=input_norm,
        output_norm=output_norm,
        output_high=output_high,
        output_low=output_low,
        name_scope=name_scope
    )
    mlp_net = MLP(input_ph=mlp_input_ph, net_name='mlp', **mlp_kwargs)
    parameters = ParametersWithTensorflowVariable(tf_var_list=mlp_net.var_list,
                                                  rest_parameters=mlp_kwargs,
                                                  name='mlp_v_value_function_tf_param')
    VValueFunction.__init__(self,
                            env_spec=env_spec,
                            state_input=state_input,
                            name=name,
                            parameters=None)
    PlaceholderInput.__init__(self, inputs=mlp_input_ph, parameters=parameters)
    self.name_scope = name_scope
    self.mlp_config = mlp_config
    self.input_norm = input_norm
    self.output_norm = output_norm
    self.output_low = output_low
    self.output_high = output_high
    self.state_input = state_input
    self.mlp_input_ph = mlp_input_ph
    self.mlp_net = mlp_net
    # V(s) output tensor of the underlying MLP.
    self.v_tensor = self.mlp_net.output
def __init__(self,
             env_spec: EnvSpec,
             stochastic_policy: StochasticPolicy,
             config_or_config_dict: (DictConfig, dict),
             value_func: VValueFunction,
             warm_up_trajectories_number=5,
             use_time_index_flag=False,
             name='ppo'):
    """PPO set-up: old-policy copy, KL/entropy tensors and training ops.

    :param env_spec: environment specification.
    :param stochastic_policy: policy to optimize; a frozen copy becomes the
        'old' policy fed through distribution placeholders.
    :param config_or_config_dict: DictConfig or dict of hyper-parameters.
    :param value_func: state-value baseline network.
    :param warm_up_trajectories_number: trajectories used to warm up the
        observation scaler.
    :param use_time_index_flag: when True, the last obs dimension is treated
        as a raw time index and excluded from standardization.
    :param name: tf scope / instance name.
    """
    ModelFreeAlgo.__init__(self,
                           env_spec=env_spec,
                           name=name,
                           warm_up_trajectories_number=warm_up_trajectories_number)
    self.use_time_index_flag = use_time_index_flag
    self.config = construct_dict_config(config_or_config_dict, self)
    self.policy = stochastic_policy
    self.value_func = value_func
    to_ph_parameter_dict = dict()
    self.trajectory_memory = TrajectoryData(env_spec=env_spec)
    self.transition_data_for_trajectory = TransitionData(env_spec=env_spec)
    self.value_func_train_data_buffer = None
    self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim)
    if use_time_index_flag:
        # Keep the trailing time-index dimension effectively un-normalized:
        # zero mean and a very large variance.
        scale_last_time_index_mean = self.scaler._mean
        scale_last_time_index_mean[-1] = 0
        scale_last_time_index_var = self.scaler._var
        scale_last_time_index_var[-1] = 1000 * 1000
        self.scaler.set_param(mean=scale_last_time_index_mean,
                              var=scale_last_time_index_var)
    with tf.variable_scope(name):
        self.advantages_ph = tf.placeholder(tf.float32, (None, ), 'advantages')
        self.v_func_val_ph = tf.placeholder(tf.float32, (None, ), 'val_val_func')
        dist_info_list = self.policy.get_dist_info()
        # Placeholders mirroring the policy's distribution tensors; they feed
        # the frozen 'old' policy used in the KL term below.
        self.old_dist_tensor = [
            (tf.placeholder(**dict(dtype=dist_info['dtype'],
                                   shape=dist_info['shape'],
                                   name=dist_info['name'])), dist_info['name'])
            for dist_info in dist_info_list
        ]
        self.old_policy = self.policy.make_copy(
            reuse=False,
            name_scope='old_{}'.format(self.policy.name),
            name='old_{}'.format(self.policy.name),
            distribution_tensors_tuple=tuple(self.old_dist_tensor))
        # Penalty coefficients are fed at run time through placeholders.
        to_ph_parameter_dict['beta'] = tf.placeholder(
            tf.float32, (), 'beta')
        to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta')
        to_ph_parameter_dict['kl_target'] = tf.placeholder(
            tf.float32, (), 'kl_target')
        to_ph_parameter_dict['lr_multiplier'] = tf.placeholder(
            tf.float32, (), 'lr_multiplier')
    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(
            advantages_ph=self.advantages_ph,
            v_func_val_ph=self.v_func_val_ph,
        ),
        to_ph_parameter_dict=to_ph_parameter_dict,
        name='ppo_param',
        save_rest_param_flag=False,
        source_config=self.config,
        require_snapshot=False)
    with tf.variable_scope(name):
        with tf.variable_scope('train'):
            self.kl = tf.reduce_mean(self.old_policy.kl(self.policy))
            self.average_entropy = tf.reduce_mean(self.policy.entropy())
            self.policy_loss, self.policy_optimizer, self.policy_update_op = self._setup_policy_loss()
            self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = self._setup_value_func_loss()
    # Train-scope variables plus optimizer slots, sorted by name so the
    # registered variable order is deterministic.
    var_list = get_tf_collection_var_list(
        '{}/train'.format(name)) + self.policy_optimizer.variables(
    ) + self.value_func_optimizer.variables()
    self.parameters.set_tf_var_list(
        tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(self,
                                   sub_placeholder_input_list=[
                                       dict(
                                           obj=self.value_func,
                                           attr_name='value_func',
                                       ),
                                       dict(obj=self.policy, attr_name='policy')
                                   ],
                                   parameters=self.parameters)
class DeterministicMLPPolicy(DeterministicPolicy, PlaceholderInput):
    """Deterministic policy backed by an MLP mapping flat obs -> flat action."""

    def __init__(self, env_spec: EnvSpec,
                 name_scope: str,
                 name: str,
                 mlp_config: list,
                 input_norm: np.ndarray = None,
                 output_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 reuse=False):
        """
        :param env_spec: environment specification (flat obs/action dims).
        :param name_scope: tf variable scope for the state placeholder / network.
        :param name: policy instance name.
        :param mlp_config: per-layer config list; the last layer's N_UNITS must
            equal the flat action dimension.
        :param input_norm: optional input normalization stats.
        :param output_norm: optional output normalization stats.
        :param output_low: optional lower bound applied to the output.
        :param output_high: optional upper bound applied to the output.
        :param reuse: tf reuse flag for the underlying network.
        """
        DeterministicPolicy.__init__(self, env_spec=env_spec, name=name, parameters=None)
        obs_dim = env_spec.flat_obs_dim
        action_dim = env_spec.flat_action_dim
        # BUG FIX: removed leftover debug print() calls; the assertion message
        # now carries the same diagnostic information.
        assert action_dim == mlp_config[-1]['N_UNITS'], \
            'last layer N_UNITS {} != flat action dim {}'.format(mlp_config[-1]['N_UNITS'], action_dim)
        with tf.variable_scope(name_scope):
            state_input = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32, name='state_ph')
        mlp_kwargs = dict(reuse=reuse,
                          input_norm=input_norm,
                          output_norm=output_norm,
                          output_low=output_low,
                          output_high=output_high,
                          mlp_config=mlp_config,
                          name_scope=name_scope)
        mlp_net = MLP(input_ph=state_input, **mlp_kwargs, net_name='deterministic_mlp_policy')
        PlaceholderInput.__init__(self, parameters=None)
        self.parameters = ParametersWithTensorflowVariable(
            tf_var_list=mlp_net.var_list,
            rest_parameters=mlp_kwargs,
            name='deterministic_mlp_policy_tf_param')
        self.state_input = state_input
        self.mlp_net = mlp_net
        self.action_tensor = mlp_net.output
        # BUG FIX: self.mlp_config was assigned twice in the original.
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high
        self.name_scope = name_scope

    def forward(self, obs: (np.ndarray, list), sess=None, feed_dict=None, **kwargs):
        """Compute the action for ``obs``, clipped into the action space bounds.

        :param obs: single observation or batch; batched automatically.
        :param sess: tf session; falls back to the current default session.
        :param feed_dict: extra feed entries merged into the run call.
        """
        obs = make_batch(obs, original_shape=self.env_spec.obs_shape)
        feed_dict = {} if feed_dict is None else feed_dict
        feed_dict = {
            **feed_dict,
            self.state_input: obs,
            **self.parameters.return_tf_parameter_feed_dict()
        }
        sess = sess if sess else tf.get_default_session()
        res = sess.run(self.action_tensor, feed_dict=feed_dict)
        res = np.clip(res, a_min=self.env_spec.action_space.low, a_max=self.env_spec.action_space.high)
        return res

    def copy_from(self, obj) -> bool:
        """Copy parameter values from another policy via PlaceholderInput."""
        return PlaceholderInput.copy_from(self, obj)

    def make_copy(self, *args, **kwargs):
        """Create a new policy with the same hyper-parameters (reuse handled
        by _get_copy_arg_with_tf_reuse)."""
        kwargs = _get_copy_arg_with_tf_reuse(obj=self, kwargs=kwargs)
        copy_mlp_policy = DeterministicMLPPolicy(env_spec=self.env_spec,
                                                 input_norm=self.input_norm,
                                                 output_norm=self.output_norm,
                                                 output_low=self.output_low,
                                                 output_high=self.output_high,
                                                 mlp_config=self.mlp_config,
                                                 **kwargs)
        return copy_mlp_policy

    def save(self, *args, **kwargs):
        """Delegate checkpointing to PlaceholderInput."""
        return PlaceholderInput.save(self, *args, **kwargs)

    def load(self, *args, **kwargs):
        """Delegate checkpoint restore to PlaceholderInput."""
        return PlaceholderInput.load(self, *args, **kwargs)
class DDPG(ModelFreeAlgo, OffPolicyAlgo, MultiPlaceholderInput):
    """Deep Deterministic Policy Gradient: actor-critic with target networks
    and soft (Polyak) target updates."""

    required_key_dict = DictConfig.load_json(file_path=GlobalConfig().DEFAULT_DDPG_REQUIRED_KEY_LIST)

    @typechecked
    def __init__(self,
                 env_spec: EnvSpec,
                 config_or_config_dict: (DictConfig, dict),
                 value_func: MLPQValueFunction,
                 policy: DeterministicMLPPolicy,
                 schedule_param_list=None,
                 name='ddpg',
                 replay_buffer=None):
        """
        :param env_spec: environment specifications, like action space or observation space
        :param config_or_config_dict: configuration dictionary, like learning rate or decay, if any
        :param value_func: value function
        :param policy: agent policy
        :param schedule_param_list: schedule parameter list, if any, with initial/final
               values and a function to schedule the learning process
        :param name: name of algorithm class instance
        :param replay_buffer: replay buffer, if any
        """
        ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
        config = construct_dict_config(config_or_config_dict, self)
        self.config = config
        self.actor = policy
        self.target_actor = self.actor.make_copy(name_scope='{}_target_actor'.format(self.name),
                                                 name='{}_target_actor'.format(self.name),
                                                 reuse=False)
        self.critic = value_func
        self.target_critic = self.critic.make_copy(name_scope='{}_target_critic'.format(self.name),
                                                   name='{}_target_critic'.format(self.name),
                                                   reuse=False)
        self.state_input = self.actor.state_input
        if replay_buffer:
            # BUG FIX: the original asserted issubclass(replay_buffer, ...), but a
            # buffer *instance* is stored and sampled from in train(), and
            # issubclass() raises TypeError when handed an instance.
            assert isinstance(replay_buffer, BaseReplayBuffer)
            self.replay_buffer = replay_buffer
        else:
            self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'),
                                                           action_shape=self.env_spec.action_shape,
                                                           observation_shape=self.env_spec.obs_shape)
        # self.parameters contains all the parameters (variables) of the algorithm.
        self.parameters = ParametersWithTensorflowVariable(tf_var_list=[],
                                                           rest_parameters=dict(),
                                                           to_scheduler_param_tuple=schedule_param_list,
                                                           name='ddpg_param',
                                                           source_config=config,
                                                           require_snapshot=False)
        # Critic evaluated at the actor's own action (weights shared with
        # self.critic via reuse=True); used for the policy-gradient loss.
        self._critic_with_actor_output = self.critic.make_copy(reuse=True,
                                                               name='actor_input_{}'.format(self.critic.name),
                                                               state_input=self.state_input,
                                                               action_input=self.actor.action_tensor)
        self._target_critic_with_target_actor_output = self.target_critic.make_copy(
            reuse=True,
            name='target_critic_with_target_actor_output_{}'.format(self.critic.name),
            action_input=self.target_actor.action_tensor)
        with tf.variable_scope(name):
            self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
            self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32)
            self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
            self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
            done = tf.cast(self.done_input, dtype=tf.float32)
            # Bellman target r + gamma * target_q (fed externally), masked to r
            # on terminal transitions.
            self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input
            with tf.variable_scope('train'):
                self.critic_loss, self.critic_update_op, self.target_critic_update_op, self.critic_optimizer, \
                self.critic_grads = self._setup_critic_loss()
                self.actor_loss, self.actor_update_op, self.target_actor_update_op, self.action_optimizer, \
                self.actor_grads = self._set_up_actor_loss()
        # Train-scope variables plus both optimizers' slot variables, sorted by
        # name for deterministic ordering.
        var_list = get_tf_collection_var_list(
            '{}/train'.format(name)) + self.critic_optimizer.variables() + self.action_optimizer.variables()
        self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
        MultiPlaceholderInput.__init__(self,
                                       sub_placeholder_input_list=[dict(obj=self.target_actor,
                                                                        attr_name='target_actor'),
                                                                   dict(obj=self.actor,
                                                                        attr_name='actor'),
                                                                   dict(obj=self.critic,
                                                                        attr_name='critic'),
                                                                   dict(obj=self.target_critic,
                                                                        attr_name='target_critic')],
                                       parameters=self.parameters)

    @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='INITED')
    def init(self, sess=None, source_obj=None):
        """Initialize all sub-networks and parameters; the target critic is
        initialized from the critic."""
        self.actor.init()
        self.critic.init()
        self.target_actor.init()
        self.target_critic.init(source_obj=self.critic)
        self.parameters.init()
        if source_obj:
            self.copy_from(source_obj)
        super().init()

    @record_return_decorator(which_recorder='self')
    @register_counter_info_to_status_decorator(increment=1, info_key='train', under_status='TRAIN')
    def train(self, batch_data=None, train_iter=None, sess=None, update_target=True) -> dict:
        """Run ``train_iter`` critic+actor updates (on ``batch_data`` or replay
        samples) and optionally apply the soft target updates once at the end.

        :return: dict with average actor and critic losses over the iterations.
        """
        super(DDPG, self).train()
        if isinstance(batch_data, TrajectoryData):
            batch_data = batch_data.return_as_transition_data(shuffle_flag=True)
        tf_sess = sess if sess else tf.get_default_session()
        train_iter = self.parameters("TRAIN_ITERATION") if not train_iter else train_iter
        average_critic_loss = 0.0
        average_actor_loss = 0.0
        for i in range(train_iter):
            train_batch = self.replay_buffer.sample(
                batch_size=self.parameters('BATCH_SIZE')) if batch_data is None else batch_data
            assert isinstance(train_batch, TransitionData)
            critic_loss, _ = self._critic_train(train_batch, tf_sess)
            actor_loss, _ = self._actor_train(train_batch, tf_sess)
            average_actor_loss += actor_loss
            average_critic_loss += critic_loss
        if update_target:
            tf_sess.run([self.target_actor_update_op, self.target_critic_update_op])
        return dict(average_actor_loss=average_actor_loss / train_iter,
                    average_critic_loss=average_critic_loss / train_iter)

    def _critic_train(self, batch_data, sess) -> tuple:
        """One critic update on ``batch_data``; returns (loss, grads)."""
        # Target Q values come from the target critic fed with the target
        # actor's action at the next state.
        target_q = sess.run(
            self._target_critic_with_target_actor_output.q_tensor,
            feed_dict={
                self._target_critic_with_target_actor_output.state_input: batch_data.new_state_set,
                self.target_actor.state_input: batch_data.new_state_set
            }
        )
        loss, _, grads = sess.run(
            [self.critic_loss, self.critic_update_op, self.critic_grads],
            feed_dict={
                self.target_q_input: target_q,
                self.critic.state_input: batch_data.state_set,
                self.critic.action_input: batch_data.action_set,
                self.done_input: np.reshape(batch_data.done_set, [-1, 1]),
                self.reward_input: np.reshape(batch_data.reward_set, [-1, 1]),
                **self.parameters.return_tf_parameter_feed_dict()
            }
        )
        return loss, grads

    def _actor_train(self, batch_data, sess) -> tuple:
        """One actor update on ``batch_data``; returns (loss, grads)."""
        target_q, loss, _, grads = sess.run(
            [self._critic_with_actor_output.q_tensor, self.actor_loss, self.actor_update_op, self.actor_grads],
            feed_dict={
                self.actor.state_input: batch_data.state_set,
                self._critic_with_actor_output.state_input: batch_data.state_set,
                **self.parameters.return_tf_parameter_feed_dict()
            }
        )
        return loss, grads

    @register_counter_info_to_status_decorator(increment=1, info_key='test', under_status='TEST')
    def test(self, *arg, **kwargs) -> dict:
        return super().test(*arg, **kwargs)

    def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False):
        """Deterministic action for ``obs`` from the actor network."""
        tf_sess = sess if sess else tf.get_default_session()
        feed_dict = {
            self.state_input: make_batch(obs, original_shape=self.env_spec.obs_shape),
            **self.parameters.return_tf_parameter_feed_dict()
        }
        return self.actor.forward(obs=obs, sess=tf_sess, feed_dict=feed_dict)

    def append_to_memory(self, samples: TransitionData):
        """Append a batch of transitions to the replay buffer."""
        self.replay_buffer.append_batch(obs0=samples.state_set,
                                        obs1=samples.new_state_set,
                                        action=samples.action_set,
                                        reward=samples.reward_set,
                                        terminal1=samples.done_set)

    @record_return_decorator(which_recorder='self')
    def save(self, global_step, save_path=None, name=None, **kwargs):
        """Checkpoint all sub-networks; returns the checkpoint metadata dict."""
        save_path = save_path if save_path else GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH
        name = name if name else self.name
        MultiPlaceholderInput.save(self, save_path=save_path, global_step=global_step, name=name, **kwargs)
        return dict(check_point_save_path=save_path, check_point_save_global_step=global_step,
                    check_point_save_name=name)

    @record_return_decorator(which_recorder='self')
    def load(self, path_to_model, model_name, global_step=None, **kwargs):
        """Restore all sub-networks from a checkpoint."""
        MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs)
        return dict(check_point_load_path=path_to_model, check_point_load_global_step=global_step,
                    check_point_load_name=model_name)

    def _setup_critic_loss(self):
        """Build critic MSE loss (+ regularizers), optimizer with optional grad
        clipping, and the target-critic soft-update ops."""
        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.critic.name_scope)
        loss = tf.reduce_sum((self.predict_q_value - self.critic.q_tensor) ** 2)
        if len(reg_loss) > 0:
            loss += tf.reduce_sum(reg_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.parameters('CRITIC_LEARNING_RATE'))
        grad_var_pair = optimizer.compute_gradients(loss=loss, var_list=self.critic.parameters('tf_var_list'))
        grads = [g[0] for g in grad_var_pair]
        if self.parameters('critic_clip_norm') is not None:
            grad_var_pair, grads = clip_grad(optimizer=optimizer,
                                             loss=loss,
                                             var_list=self.critic.parameters('tf_var_list'),
                                             clip_norm=self.parameters('critic_clip_norm'))
        optimize_op = optimizer.apply_gradients(grad_var_pair)
        op = []
        # Soft target update: target <- decay * target + (1 - decay) * online.
        for var, target_var in zip(self.critic.parameters('tf_var_list'),
                                   self.target_critic.parameters('tf_var_list')):
            ref_val = self.parameters('DECAY') * target_var + (1.0 - self.parameters('DECAY')) * var
            op.append(tf.assign(target_var, ref_val))
        return loss, optimize_op, op, optimizer, grads

    def _set_up_actor_loss(self):
        """Build actor loss -mean(Q(s, mu(s))) (+ regularizers), optimizer with
        optional grad clipping, and the target-actor soft-update ops."""
        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.actor.name_scope)
        loss = -tf.reduce_mean(self._critic_with_actor_output.q_tensor)
        if len(reg_loss) > 0:
            loss += tf.reduce_sum(reg_loss)
        # BUG FIX: the actor optimizer used CRITIC_LEARNING_RATE (copy-paste
        # from _setup_critic_loss); ACTOR_LEARNING_RATE is part of DDPG's
        # required config keys and is the intended rate here.
        optimizer = tf.train.AdamOptimizer(learning_rate=self.parameters('ACTOR_LEARNING_RATE'))
        grad_var_pair = optimizer.compute_gradients(loss=loss, var_list=self.actor.parameters('tf_var_list'))
        grads = [g[0] for g in grad_var_pair]
        if self.parameters('actor_clip_norm') is not None:
            # BUG FIX: clipping was done with critic_clip_norm even though the
            # branch is guarded by actor_clip_norm; use the actor's own norm.
            grad_var_pair, grads = clip_grad(optimizer=optimizer,
                                             loss=loss,
                                             var_list=self.actor.parameters('tf_var_list'),
                                             clip_norm=self.parameters('actor_clip_norm'))
        optimize_op = optimizer.apply_gradients(grad_var_pair)
        op = []
        # Soft target update: target <- decay * target + (1 - decay) * online.
        for var, target_var in zip(self.actor.parameters('tf_var_list'),
                                   self.target_actor.parameters('tf_var_list')):
            ref_val = self.parameters('DECAY') * target_var + (1.0 - self.parameters('DECAY')) * var
            op.append(tf.assign(target_var, ref_val))
        return loss, optimize_op, op, optimizer, grads
class PPO(ModelFreeAlgo, OnPolicyAlgo, MultiPlaceholderInput):
    """Proximal Policy Optimization (policy + state-value baseline training)."""

    # Required hyper-parameter keys, loaded from the packaged PPO key list.
    required_key_dict = DictConfig.load_json(
        file_path=GlobalConfig().DEFAULT_PPO_REQUIRED_KEY_LIST)

    @typechecked
    def __init__(self,
                 env_spec: EnvSpec,
                 stochastic_policy: StochasticPolicy,
                 config_or_config_dict: (DictConfig, dict),
                 value_func: VValueFunction,
                 warm_up_trajectories_number=5,
                 use_time_index_flag=False,
                 name='ppo'):
        """
        :param env_spec: environment specification.
        :param stochastic_policy: policy to optimize; a frozen copy becomes
            the 'old' policy fed through distribution placeholders.
        :param config_or_config_dict: DictConfig or dict of hyper-parameters.
        :param value_func: state-value baseline network.
        :param warm_up_trajectories_number: trajectories used to warm up the
            observation scaler.
        :param use_time_index_flag: when True, the last obs dimension is
            treated as a raw time index and excluded from standardization.
        :param name: tf scope / instance name.
        """
        ModelFreeAlgo.__init__(
            self,
            env_spec=env_spec,
            name=name,
            warm_up_trajectories_number=warm_up_trajectories_number)
        self.use_time_index_flag = use_time_index_flag
        self.config = construct_dict_config(config_or_config_dict, self)
        self.policy = stochastic_policy
        self.value_func = value_func
        to_ph_parameter_dict = dict()
        self.trajectory_memory = TrajectoryData(env_spec=env_spec)
        self.transition_data_for_trajectory = TransitionData(env_spec=env_spec)
        self.value_func_train_data_buffer = None
        self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim)
        if use_time_index_flag:
            # Keep the trailing time-index dimension effectively
            # un-normalized: zero mean and a very large variance.
            scale_last_time_index_mean = self.scaler._mean
            scale_last_time_index_mean[-1] = 0
            scale_last_time_index_var = self.scaler._var
            scale_last_time_index_var[-1] = 1000 * 1000
            self.scaler.set_param(mean=scale_last_time_index_mean,
                                  var=scale_last_time_index_var)
        with tf.variable_scope(name):
            self.advantages_ph = tf.placeholder(tf.float32, (None, ), 'advantages')
            self.v_func_val_ph = tf.placeholder(tf.float32, (None, ), 'val_val_func')
            dist_info_list = self.policy.get_dist_info()
            # Placeholders mirroring the policy's distribution tensors; they
            # feed the frozen 'old' policy used in the KL term below.
            self.old_dist_tensor = [
                (tf.placeholder(**dict(dtype=dist_info['dtype'],
                                       shape=dist_info['shape'],
                                       name=dist_info['name'])), dist_info['name'])
                for dist_info in dist_info_list
            ]
            self.old_policy = self.policy.make_copy(
                reuse=False,
                name_scope='old_{}'.format(self.policy.name),
                name='old_{}'.format(self.policy.name),
                distribution_tensors_tuple=tuple(self.old_dist_tensor))
            # Penalty coefficients are fed at run time through placeholders.
            to_ph_parameter_dict['beta'] = tf.placeholder(
                tf.float32, (), 'beta')
            to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta')
            to_ph_parameter_dict['kl_target'] = tf.placeholder(
                tf.float32, (), 'kl_target')
            to_ph_parameter_dict['lr_multiplier'] = tf.placeholder(
                tf.float32, (), 'lr_multiplier')
        self.parameters = ParametersWithTensorflowVariable(
            tf_var_list=[],
            rest_parameters=dict(
                advantages_ph=self.advantages_ph,
                v_func_val_ph=self.v_func_val_ph,
            ),
            to_ph_parameter_dict=to_ph_parameter_dict,
            name='ppo_param',
            save_rest_param_flag=False,
            source_config=self.config,
            require_snapshot=False)
        with tf.variable_scope(name):
            with tf.variable_scope('train'):
                self.kl = tf.reduce_mean(self.old_policy.kl(self.policy))
                self.average_entropy = tf.reduce_mean(self.policy.entropy())
                self.policy_loss, self.policy_optimizer, self.policy_update_op = self._setup_policy_loss()
                self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = self._setup_value_func_loss()
        # Train-scope variables plus optimizer slots, sorted by name so the
        # registered variable order is deterministic.
        var_list = get_tf_collection_var_list(
            '{}/train'.format(name)) + self.policy_optimizer.variables(
        ) + self.value_func_optimizer.variables()
        self.parameters.set_tf_var_list(
            tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
        MultiPlaceholderInput.__init__(self,
                                       sub_placeholder_input_list=[
                                           dict(
                                               obj=self.value_func,
                                               attr_name='value_func',
                                           ),
                                           dict(obj=self.policy, attr_name='policy')
                                       ],
                                       parameters=self.parameters)

    def warm_up(self, trajectory_data: TrajectoryData):
        """Prime the observation scaler with warm-up trajectories."""
        for traj in trajectory_data.trajectories:
            self.scaler.update_scaler(data=traj.state_set)
        if self.use_time_index_flag:
            # Re-pin the time-index dimension after the scaler update.
            scale_last_time_index_mean = self.scaler._mean
            scale_last_time_index_mean[-1] = 0
            scale_last_time_index_var = self.scaler._var
            scale_last_time_index_var[-1] = 1000 * 1000
            self.scaler.set_param(mean=scale_last_time_index_mean,
                                  var=scale_last_time_index_var)

    @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='INITED')
    def init(self, sess=None, source_obj=None):
        """Initialize policy, value function and parameters; optionally copy
        from ``source_obj``."""
        self.policy.init()
        self.value_func.init()
        self.parameters.init()
        if source_obj:
            self.copy_from(source_obj)
        super().init()
@record_return_decorator(which_recorder='self') @register_counter_info_to_status_decorator(increment=1, info_key='train', under_status='TRAIN') def train(self, trajectory_data: TrajectoryData = None, train_iter=None, sess=None) -> dict: super(PPO, self).train() if trajectory_data is None: trajectory_data = self.trajectory_memory if len(trajectory_data) == 0: raise MemoryBufferLessThanBatchSizeError( 'not enough trajectory data') for i, traj in enumerate(trajectory_data.trajectories): trajectory_data.trajectories[i].append_new_set( name='state_set', shape=self.env_spec.obs_shape, data_set=np.reshape( np.array(self.scaler.process(np.array(traj.state_set))), [-1] + list(self.env_spec.obs_shape))) trajectory_data.trajectories[i].append_new_set( name='new_state_set', shape=self.env_spec.obs_shape, data_set=np.reshape( np.array(self.scaler.process(np.array( traj.new_state_set))), [-1] + list(self.env_spec.obs_shape))) tf_sess = sess if sess else tf.get_default_session() SampleProcessor.add_estimated_v_value(trajectory_data, value_func=self.value_func) SampleProcessor.add_discount_sum_reward(trajectory_data, gamma=self.parameters('gamma')) SampleProcessor.add_gae(trajectory_data, gamma=self.parameters('gamma'), name='advantage_set', lam=self.parameters('lam'), value_func=self.value_func) trajectory_data = SampleProcessor.normalization(trajectory_data, key='advantage_set') policy_res_dict = self._update_policy( state_set=np.concatenate( [t('state_set') for t in trajectory_data.trajectories], axis=0), action_set=np.concatenate( [t('action_set') for t in trajectory_data.trajectories], axis=0), advantage_set=np.concatenate( [t('advantage_set') for t in trajectory_data.trajectories], axis=0), train_iter=train_iter if train_iter else self.parameters('policy_train_iter'), sess=tf_sess) value_func_res_dict = self._update_value_func( state_set=np.concatenate( [t('state_set') for t in trajectory_data.trajectories], axis=0), discount_set=np.concatenate( [t('discount_set') for t in 
trajectory_data.trajectories], axis=0), train_iter=train_iter if train_iter else self.parameters('value_func_train_iter'), sess=tf_sess) trajectory_data.reset() self.trajectory_memory.reset() return {**policy_res_dict, **value_func_res_dict} @register_counter_info_to_status_decorator(increment=1, info_key='test', under_status='TEST') def test(self, *arg, **kwargs) -> dict: return super().test(*arg, **kwargs) @register_counter_info_to_status_decorator(increment=1, info_key='predict') def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False): tf_sess = sess if sess else tf.get_default_session() ac = self.policy.forward( obs=self.scaler.process( data=make_batch(obs, original_shape=self.env_spec.obs_shape)), sess=tf_sess, feed_dict=self.parameters.return_tf_parameter_feed_dict()) return ac def append_to_memory(self, samples: TrajectoryData): # todo how to make sure the data's time sequential obs_list = samples.trajectories[0].state_set for i in range(1, len(samples.trajectories)): obs_list = np.array( np.concatenate([obs_list, samples.trajectories[i].state_set], axis=0)) self.trajectory_memory.union(samples) self.scaler.update_scaler(data=np.array(obs_list)) if self.use_time_index_flag: scale_last_time_index_mean = self.scaler._mean scale_last_time_index_mean[-1] = 0 scale_last_time_index_var = self.scaler._var scale_last_time_index_var[-1] = 1000 * 1000 self.scaler.set_param(mean=scale_last_time_index_mean, var=scale_last_time_index_var) @record_return_decorator(which_recorder='self') def save(self, global_step, save_path=None, name=None, **kwargs): save_path = save_path if save_path else GlobalConfig( ).DEFAULT_MODEL_CHECKPOINT_PATH name = name if name else self.name MultiPlaceholderInput.save(self, save_path=save_path, global_step=global_step, name=name, **kwargs) return dict(check_point_save_path=save_path, check_point_save_global_step=global_step, check_point_save_name=name) @record_return_decorator(which_recorder='self') def load(self, 
path_to_model, model_name, global_step=None, **kwargs): MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs) return dict(check_point_load_path=path_to_model, check_point_load_global_step=global_step, check_point_load_name=model_name) def _setup_policy_loss(self): """ Code clip from pat-cody Three loss terms: 1) standard policy gradient 2) D_KL(pi_old || pi_new) 3) Hinge loss on [D_KL - kl_targ]^2 See: https://arxiv.org/pdf/1707.02286.pdf """ if self.parameters('clipping_range') is not None: pg_ratio = tf.exp(self.policy.log_prob() - self.old_policy.log_prob()) clipped_pg_ratio = tf.clip_by_value( pg_ratio, 1 - self.parameters('clipping_range')[0], 1 + self.parameters('clipping_range')[1]) surrogate_loss = tf.minimum(self.advantages_ph * pg_ratio, self.advantages_ph * clipped_pg_ratio) loss = -tf.reduce_mean(surrogate_loss) else: loss1 = -tf.reduce_mean( self.advantages_ph * tf.exp(self.policy.log_prob() - self.old_policy.log_prob())) loss2 = tf.reduce_mean(self.parameters('beta') * self.kl) loss3 = self.parameters('eta') * tf.square( tf.maximum(0.0, self.kl - 2.0 * self.parameters('kl_target'))) loss = loss1 + loss2 + loss3 self.loss1 = loss1 self.loss2 = loss2 self.loss3 = loss3 if isinstance(self.policy, PlaceholderInput): reg_list = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.policy.name_scope) if len(reg_list) > 0: reg_loss = tf.reduce_sum(reg_list) loss += reg_loss optimizer = tf.train.AdamOptimizer( learning_rate=self.parameters('policy_lr') * self.parameters('lr_multiplier')) train_op = optimizer.minimize( loss, var_list=self.policy.parameters('tf_var_list')) return loss, optimizer, train_op def _setup_value_func_loss(self): # todo update the value_func design loss = tf.reduce_mean( tf.square( tf.squeeze(self.value_func.v_tensor) - self.v_func_val_ph)) reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.value_func.name_scope) if len(reg_loss) > 0: loss += tf.reduce_sum(reg_loss) 
optimizer = tf.train.AdamOptimizer(self.parameters('value_func_lr')) train_op = optimizer.minimize( loss, var_list=self.value_func.parameters('tf_var_list')) return loss, optimizer, train_op def _update_policy(self, state_set, action_set, advantage_set, train_iter, sess): old_policy_feed_dict = dict() res = sess.run( [ getattr(self.policy, tensor[1]) for tensor in self.old_dist_tensor ], feed_dict={ self.policy.parameters('state_input'): state_set, self.policy.parameters('action_input'): action_set, **self.parameters.return_tf_parameter_feed_dict() }) for tensor, val in zip(self.old_dist_tensor, res): old_policy_feed_dict[tensor[0]] = val feed_dict = { self.policy.parameters('action_input'): action_set, self.old_policy.parameters('action_input'): action_set, self.policy.parameters('state_input'): state_set, self.advantages_ph: advantage_set, **self.parameters.return_tf_parameter_feed_dict(), **old_policy_feed_dict } average_loss, average_kl, average_entropy = 0.0, 0.0, 0.0 total_epoch = 0 kl = None for i in range(train_iter): _ = sess.run(self.policy_update_op, feed_dict=feed_dict) loss, kl, entropy = sess.run( [self.policy_loss, self.kl, self.average_entropy], feed_dict=feed_dict) average_loss += loss average_kl += kl average_entropy += entropy total_epoch = i + 1 if kl > self.parameters('kl_target', require_true_value=True) * 4: # early stopping if D_KL diverges badly break average_loss, average_kl, average_entropy = average_loss / total_epoch, average_kl / total_epoch, average_entropy / total_epoch if kl > self.parameters('kl_target', require_true_value=True ) * 2: # servo beta to reach D_KL target self.parameters.set( key='beta', new_val=np.minimum( 35, 1.5 * self.parameters('beta', require_true_value=True))) if self.parameters( 'beta', require_true_value=True) > 30 and self.parameters( 'lr_multiplier', require_true_value=True) > 0.1: self.parameters.set( key='lr_multiplier', new_val=self.parameters('lr_multiplier', require_true_value=True) / 1.5) elif kl < 
self.parameters('kl_target', require_true_value=True) / 2: self.parameters.set( key='beta', new_val=np.maximum( 1 / 35, self.parameters('beta', require_true_value=True) / 1.5)) if self.parameters('beta', require_true_value=True) < ( 1 / 30) and self.parameters('lr_multiplier', require_true_value=True) < 10: self.parameters.set( key='lr_multiplier', new_val=self.parameters('lr_multiplier', require_true_value=True) * 1.5) return dict(policy_average_loss=average_loss, policy_average_kl=average_kl, policy_average_entropy=average_entropy, policy_total_train_epoch=total_epoch) def _update_value_func(self, state_set, discount_set, train_iter, sess): y_hat = self.value_func.forward(obs=state_set).squeeze() old_exp_var = 1 - np.var(discount_set - y_hat) / np.var(discount_set) if self.value_func_train_data_buffer is None: self.value_func_train_data_buffer = (state_set, discount_set) else: self.value_func_train_data_buffer = ( np.concatenate( [self.value_func_train_data_buffer[0], state_set], axis=0), np.concatenate( [self.value_func_train_data_buffer[1], discount_set], axis=0)) if len(self.value_func_train_data_buffer[0]) > self.parameters( 'value_func_memory_size'): self.value_func_train_data_buffer = tuple( np.array(data[-self.parameters('value_func_memory_size'):]) for data in self.value_func_train_data_buffer) state_set_all, discount_set_all = self.value_func_train_data_buffer param_dict = self.parameters.return_tf_parameter_feed_dict() for i in range(train_iter): random_index = np.random.choice(np.arange(len(state_set_all)), len(state_set_all)) state_set_all = state_set_all[random_index] discount_set_all = discount_set_all[random_index] for index in range( 0, len(state_set_all) - self.parameters('value_func_train_batch_size'), self.parameters('value_func_train_batch_size')): state = np.array( state_set_all[index:index + self. 
parameters('value_func_train_batch_size')]) discount = discount_set_all[ index:index + self.parameters('value_func_train_batch_size')] loss, _ = sess.run( [self.value_func_loss, self.value_func_update_op], options=tf.RunOptions( report_tensor_allocations_upon_oom=True), feed_dict={ self.value_func.state_input: state, self.v_func_val_ph: discount, **param_dict }) y_hat = self.value_func.forward(obs=state_set).squeeze() loss = np.mean(np.square(y_hat - discount_set)) exp_var = 1 - np.var(discount_set - y_hat) / np.var(discount_set) return dict(value_func_loss=loss, value_func_policy_exp_var=exp_var, value_func_policy_old_exp_var=old_exp_var)
def __init__(self, env_spec: EnvSpec, config_or_config_dict: (DictConfig, dict), value_func: MLPQValueFunction, policy: DeterministicMLPPolicy, schedule_param_list=None, name='ddpg', replay_buffer=None): """ :param env_spec: environment specifications, like action apace or observation space :param config_or_config_dict: configuraion dictionary, like learning rate or decay, if any :param value_func: value function :param policy: agent policy :param schedule_param_list: schedule parameter list, if any initla final function to schedule learning process :param name: name of algorithm class instance :param replay_buffer: replay buffer, if any """ ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name) config = construct_dict_config(config_or_config_dict, self) self.config = config self.actor = policy self.target_actor = self.actor.make_copy(name_scope='{}_target_actor'.format(self.name), name='{}_target_actor'.format(self.name), reuse=False) self.critic = value_func self.target_critic = self.critic.make_copy(name_scope='{}_target_critic'.format(self.name), name='{}_target_critic'.format(self.name), reuse=False) self.state_input = self.actor.state_input if replay_buffer: assert issubclass(replay_buffer, BaseReplayBuffer) self.replay_buffer = replay_buffer else: self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'), action_shape=self.env_spec.action_shape, observation_shape=self.env_spec.obs_shape) """ self.parameters contains all the parameters (variables) of the algorithm """ self.parameters = ParametersWithTensorflowVariable(tf_var_list=[], rest_parameters=dict(), to_scheduler_param_tuple=schedule_param_list, name='ddpg_param', source_config=config, require_snapshot=False) self._critic_with_actor_output = self.critic.make_copy(reuse=True, name='actor_input_{}'.format(self.critic.name), state_input=self.state_input, action_input=self.actor.action_tensor) self._target_critic_with_target_actor_output = 
self.target_critic.make_copy(reuse=True, name='target_critic_with_target_actor_output_{}'.format( self.critic.name), action_input=self.target_actor.action_tensor) with tf.variable_scope(name): self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32) self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32) self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool) self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32) done = tf.cast(self.done_input, dtype=tf.float32) self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input with tf.variable_scope('train'): self.critic_loss, self.critic_update_op, self.target_critic_update_op, self.critic_optimizer, \ self.critic_grads = self._setup_critic_loss() self.actor_loss, self.actor_update_op, self.target_actor_update_op, self.action_optimizer, \ self.actor_grads = self._set_up_actor_loss() var_list = get_tf_collection_var_list( '{}/train'.format(name)) + self.critic_optimizer.variables() + self.action_optimizer.variables() self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name)) MultiPlaceholderInput.__init__(self, sub_placeholder_input_list=[dict(obj=self.target_actor, attr_name='target_actor', ), dict(obj=self.actor, attr_name='actor'), dict(obj=self.critic, attr_name='critic'), dict(obj=self.target_critic, attr_name='target_critic') ], parameters=self.parameters)
    def __init__(
            self,
            env_spec: EnvSpec,
            name: str,
            name_scope: str,
            mlp_config: list,
            state_input: tf.Tensor = None,
            action_input: tf.Tensor = None,
            reuse=False,
            input_norm: np.ndarray = None,
            output_norm: np.ndarray = None,
            output_low: np.ndarray = None,
            output_high: np.ndarray = None,
    ):
        """Build an MLP Q-value function Q(s, a).

        State and action placeholders are created unless existing tensors are
        supplied (used by make_copy to re-wire inputs). The MLP consumes the
        concatenation [state, action] and its output is exposed as q_tensor.

        :param env_spec: environment specification (flat obs/action dims)
        :param name: instance name (used for parameter naming)
        :param name_scope: TF name/variable scope for placeholders and weights
        :param mlp_config: list of per-layer MLP configuration dicts
        :param state_input: optional existing state tensor to reuse
        :param action_input: optional existing action tensor to reuse
        :param reuse: whether to reuse TF variables of an existing scope
        :param input_norm/output_norm/output_low/output_high: optional
               normalization / output-bounding arrays forwarded to the MLP
        """
        with tf.name_scope(name_scope):
            state_input = state_input if state_input is not None else tf.placeholder(
                shape=[None, env_spec.flat_obs_dim],
                dtype=tf.float32,
                name='state_ph')
            action_input = action_input if action_input is not None else tf.placeholder(
                shape=[None, env_spec.flat_action_dim],
                dtype=tf.float32,
                name='action_ph')
        with tf.variable_scope(name_scope):
            # The Q-network input is the concatenated state-action vector.
            mlp_input_ph = tf.concat([state_input, action_input],
                                     axis=1,
                                     name='state_action_input')
        mlp_net_kwargs = dict(
            reuse=reuse,
            mlp_config=mlp_config,
            input_norm=input_norm,
            output_norm=output_norm,
            output_high=output_high,
            output_low=output_low,
            name_scope=name_scope,
        )
        mlp_net = MLP(input_ph=mlp_input_ph,
                      net_name=name_scope,
                      **mlp_net_kwargs)
        parameters = ParametersWithTensorflowVariable(
            tf_var_list=mlp_net.var_list,
            rest_parameters=dict(**mlp_net_kwargs, name=name),
            default_save_type='tf',
            name='{}_tf_param'.format(name))
        # NOTE(review): parameters=None here on purpose — the real parameter
        # object is installed by PlaceholderInput.__init__ below.
        QValueFunction.__init__(self,
                                env_spec=env_spec,
                                name=name,
                                action_input=action_input,
                                state_input=state_input,
                                parameters=None)
        PlaceholderInput.__init__(self, parameters=parameters)
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high
        self.name_scope = name_scope
        self.mlp_input_ph = mlp_input_ph
        self.mlp_net = mlp_net
        # Q(s, a) output tensor.
        self.q_tensor = self.mlp_net.output
class NormalDistributionMLPPolicy(StochasticPolicy, PlaceholderInput):
    """Gaussian stochastic policy: an MLP outputs the mean, and a learned
    (state-independent) log-variance variable parameterizes a diagonal
    multivariate normal from which actions are sampled."""

    def __init__(self,
                 env_spec: EnvSpec,
                 name: str,
                 name_scope: str,
                 mlp_config: list,
                 input_norm: np.ndarray = None,
                 output_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 reuse=False,
                 distribution_tensors_tuple: tuple = None):
        """Build the policy graph.

        :param env_spec: environment specification
        :param name: instance name
        :param name_scope: TF scope for placeholders and variables
        :param mlp_config: list of per-layer MLP config dicts; last layer's
               N_UNITS must equal the flat action dimension
        :param input_norm/output_norm/output_low/output_high: optional
               normalization / bounding arrays forwarded to the MLP
        :param reuse: whether to reuse existing TF variables
        :param distribution_tensors_tuple: if given, (mean, logvar) tensors
               to use directly instead of building an MLP — used by PPO to
               create the frozen "old policy" fed through placeholders
        """
        StochasticPolicy.__init__(self,
                                  env_spec=env_spec,
                                  name=name,
                                  parameters=None)
        obs_dim = env_spec.flat_obs_dim
        action_dim = env_spec.flat_action_dim
        assert action_dim == mlp_config[-1]['N_UNITS']
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high
        self.mlp_config = mlp_config
        self.name_scope = name_scope
        mlp_kwargs = dict(reuse=reuse,
                          input_norm=input_norm,
                          output_norm=output_norm,
                          output_low=output_low,
                          output_high=output_high,
                          mlp_config=mlp_config,
                          name_scope=name_scope)
        ph_inputs = []
        if distribution_tensors_tuple is not None:
            # External (mean, logvar) tensors provided: no MLP is built.
            # NOTE(review): on this branch self.state_input is never assigned
            # here, yet rest_parameters below reads it — presumably set by a
            # base class; verify before relying on it.
            self.mean_output = distribution_tensors_tuple[0][0]
            self.logvar_output = distribution_tensors_tuple[1][0]
            assert list(self.mean_output.shape)[-1] == action_dim
            assert list(self.logvar_output.shape)[-1] == action_dim
            self.mlp_net = None
        else:
            with tf.variable_scope(self.name_scope):
                self.state_input = tf.placeholder(shape=[None, obs_dim],
                                                  dtype=tf.float32,
                                                  name='state_ph')
                ph_inputs.append(self.state_input)
            self.mlp_net = MLP(input_ph=self.state_input,
                               net_name='normal_distribution_mlp_policy',
                               **mlp_kwargs)
            self.mean_output = self.mlp_net.output
            with tf.variable_scope(name_scope, reuse=reuse):
                with tf.variable_scope('norm_dist', reuse=reuse):
                    # Log-variance is a free variable (not state-dependent);
                    # logvar_speed rows are summed, a trick from pat-cody's
                    # TRPO/PPO code to scale the effective learning rate of
                    # the variance.
                    logvar_speed = (10 * self.mlp_config[-2]['N_UNITS']) // 48
                    logvar_output = tf.get_variable(
                        name='normal_distribution_variance',
                        shape=[logvar_speed, self.mlp_config[-1]['N_UNITS']],
                        dtype=tf.float32)
                    # self.logvar_output = tf.reduce_sum(logvar_output, axis=0) + self.parameters('log_var_init')
                    self.logvar_output = tf.reduce_sum(logvar_output, axis=0)
        with tf.variable_scope(name_scope, reuse=reuse):
            self.action_input = tf.placeholder(shape=[None, action_dim],
                                               dtype=tf.float32,
                                               name='action_ph')
            ph_inputs.append(self.action_input)
            with tf.variable_scope('norm_dist', reuse=reuse):
                self.stddev_output = tf.exp(self.logvar_output / 2.0,
                                            name='std_dev')
                self.var_output = tf.exp(self.logvar_output, name='variance')
                self.action_distribution = tfp.distributions.MultivariateNormalDiag(
                    loc=self.mean_output,
                    scale_diag=self.stddev_output,
                    name='mlp_normal_distribution')
                self.action_output = self.action_distribution.sample()
        # Map statistic names to the callables that build their tensors.
        self.dist_info_tensor_op_dict = {
            # todo support more in future
            'prob': self.action_distribution.prob,
            'log_prob': self.action_distribution.log_prob,
            'entropy': self.action_distribution.entropy,
            'kl': self.kl
        }
        var_list = get_tf_collection_var_list(
            scope='{}/norm_dist'.format(name_scope))
        if self.mlp_net:
            var_list += self.mlp_net.var_list
        self.parameters = ParametersWithTensorflowVariable(
            tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name),
            rest_parameters=dict(state_input=self.state_input,
                                 action_input=self.action_input,
                                 **mlp_kwargs),
            name='normal_distribution_mlp_tf_param')
        PlaceholderInput.__init__(self,
                                  parameters=self.parameters,
                                  inputs=tuple(ph_inputs))

    @typechecked
    @overrides.overrides
    def forward(self,
                obs: (np.ndarray, list),
                sess=None,
                feed_dict=None,
                **kwargs):
        """Sample action(s) for obs, clipped to the action-space bounds."""
        obs = make_batch(obs, original_shape=self.env_spec.obs_shape)
        feed_dict = feed_dict if feed_dict is not None else dict()
        feed_dict = {
            **feed_dict, self.state_input: obs,
            **self.parameters.return_tf_parameter_feed_dict()
        }
        sess = sess if sess else tf.get_default_session()
        res = sess.run(self.action_output, feed_dict=feed_dict)
        res = np.clip(res,
                      a_min=self.env_spec.action_space.low,
                      a_max=self.env_spec.action_space.high)
        return res

    @overrides.overrides
    def copy_from(self, obj) -> bool:
        """Copy all TF parameters from another placeholder-input object."""
        return PlaceholderInput.copy_from(self, obj)

    def make_copy(self, **kwargs):
        """Create a copy of this policy; kwargs may override scope/reuse
        and distribution tensors (see __init__)."""
        kwargs = _get_copy_arg_with_tf_reuse(obj=self, kwargs=kwargs)
        copy_mlp_policy = NormalDistributionMLPPolicy(
            env_spec=self.env_spec,
            input_norm=self.input_norm,
            output_norm=self.output_norm,
            output_low=self.output_low,
            output_high=self.output_high,
            mlp_config=self.mlp_config,
            **kwargs)
        return copy_mlp_policy

    def compute_dist_info(self, name, sess=None, **kwargs) -> np.ndarray:
        """Evaluate a distribution statistic ('log_prob'/'prob'/'entropy'/'kl')
        numerically; 'value' (ndarray) is required for prob/log_prob and
        'other' (same policy type) for kl. An optional 'feed_dict' is passed
        through to sess.run."""
        assert name in ['log_prob', 'prob', 'entropy', 'kl']
        sess = sess if sess else tf.get_default_session()
        if name in ['log_prob', 'prob']:
            if 'value' not in kwargs:
                raise ValueError(
                    'To compute {}, pass the parameter with key {} and type {} in'
                    .format(name, 'value', np.ndarray.__name__))
            assert isinstance(kwargs['value'], np.ndarray)
        if name == 'kl':
            if 'other' not in kwargs:
                raise ValueError(
                    'To compute {}, pass the parameter with key {} and type {} in'
                    .format(name, 'other', type(self).__name__))
            assert isinstance(kwargs['other'], type(self))
        if 'feed_dict' in kwargs:
            feed_dict = kwargs['feed_dict'] if 'feed_dict' in kwargs else None
            kwargs.pop('feed_dict')
        else:
            feed_dict = None
        return sess.run(self.dist_info_tensor_op_dict[name](**kwargs),
                        feed_dict=feed_dict)

    def kl(self, other, *args, **kwargs) -> tf.Tensor:
        """Symbolic KL divergence D_KL(self || other)."""
        if not isinstance(other.action_distribution,
                          tfp.distributions.Distribution):
            raise TypeError()
        return self.action_distribution.kl_divergence(
            other.action_distribution)

    def log_prob(self, *args, **kwargs) -> tf.Tensor:
        """Symbolic log-probability of the action placeholder."""
        return self.dist_info_tensor_op_dict['log_prob'](
            value=self.action_input)

    def prob(self, *args, **kwargs) -> tf.Tensor:
        """Symbolic probability density of the action placeholder."""
        return self.dist_info_tensor_op_dict['prob'](value=self.action_input)

    def entropy(self, *args, **kwargs) -> tf.Tensor:
        """Symbolic entropy of the action distribution."""
        return self.dist_info_tensor_op_dict['entropy']()

    def get_dist_info(self) -> tuple:
        """Describe the distribution tensors (mean, logvar) as dicts of
        shape/name/obj/dtype — consumed by PPO to build matching
        placeholders for the frozen old policy."""
        res = (dict(shape=tuple(self.mean_output.shape.as_list()),
                    name='mean_output',
                    obj=self.mean_output,
                    dtype=self.mean_output.dtype),
               dict(shape=tuple(self.logvar_output.shape.as_list()),
                    name='logvar_output',
                    obj=self.logvar_output,
                    dtype=self.logvar_output.dtype))
        # Sanity check: each entry's name must be the attribute holding obj.
        for re in res:
            attr = getattr(self, re['name'])
            if id(attr) != id(re['obj']):
                raise ValueError(
                    'key name {} should be same as the obj {} name'.format(
                        re['name'], re['obj']))
        return res

    def save(self, *args, **kwargs):
        """Delegate checkpoint saving to PlaceholderInput."""
        return PlaceholderInput.save(self, *args, **kwargs)

    def load(self, *args, **kwargs):
        """Delegate checkpoint loading to PlaceholderInput."""
        return PlaceholderInput.load(self, *args, **kwargs)
    def __init__(self,
                 env_spec: EnvSpec,
                 name: str,
                 name_scope: str,
                 mlp_config: list,
                 input_norm: np.ndarray = None,
                 output_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 reuse=False,
                 distribution_tensors_tuple: tuple = None):
        """Build a Gaussian MLP policy: an MLP mean head plus a learned
        state-independent log-variance, combined into a diagonal
        MultivariateNormal action distribution.

        NOTE(review): this constructor is a near-verbatim duplicate of
        NormalDistributionMLPPolicy.__init__ elsewhere in this file —
        consider deduplicating.

        :param distribution_tensors_tuple: if given, external (mean, logvar)
               tensors are used directly and no MLP is built (the frozen
               "old policy" mechanism used by PPO).
        """
        StochasticPolicy.__init__(self,
                                  env_spec=env_spec,
                                  name=name,
                                  parameters=None)
        obs_dim = env_spec.flat_obs_dim
        action_dim = env_spec.flat_action_dim
        # The MLP's final layer must emit one unit per action dimension.
        assert action_dim == mlp_config[-1]['N_UNITS']
        self.mlp_config = mlp_config
        self.input_norm = input_norm
        self.output_norm = output_norm
        self.output_low = output_low
        self.output_high = output_high
        self.mlp_config = mlp_config
        self.name_scope = name_scope
        mlp_kwargs = dict(reuse=reuse,
                          input_norm=input_norm,
                          output_norm=output_norm,
                          output_low=output_low,
                          output_high=output_high,
                          mlp_config=mlp_config,
                          name_scope=name_scope)
        ph_inputs = []
        if distribution_tensors_tuple is not None:
            # External tensors supplied: reuse them, build no network.
            self.mean_output = distribution_tensors_tuple[0][0]
            self.logvar_output = distribution_tensors_tuple[1][0]
            assert list(self.mean_output.shape)[-1] == action_dim
            assert list(self.logvar_output.shape)[-1] == action_dim
            self.mlp_net = None
        else:
            with tf.variable_scope(self.name_scope):
                self.state_input = tf.placeholder(shape=[None, obs_dim],
                                                  dtype=tf.float32,
                                                  name='state_ph')
                ph_inputs.append(self.state_input)
            self.mlp_net = MLP(input_ph=self.state_input,
                               net_name='normal_distribution_mlp_policy',
                               **mlp_kwargs)
            self.mean_output = self.mlp_net.output
            with tf.variable_scope(name_scope, reuse=reuse):
                with tf.variable_scope('norm_dist', reuse=reuse):
                    # Multi-row logvar variable summed over axis 0: a trick
                    # (from pat-cody's code) that scales the variance's
                    # effective learning rate by logvar_speed.
                    logvar_speed = (10 * self.mlp_config[-2]['N_UNITS']) // 48
                    logvar_output = tf.get_variable(
                        name='normal_distribution_variance',
                        shape=[logvar_speed, self.mlp_config[-1]['N_UNITS']],
                        dtype=tf.float32)
                    # self.logvar_output = tf.reduce_sum(logvar_output, axis=0) + self.parameters('log_var_init')
                    self.logvar_output = tf.reduce_sum(logvar_output, axis=0)
        with tf.variable_scope(name_scope, reuse=reuse):
            self.action_input = tf.placeholder(shape=[None, action_dim],
                                               dtype=tf.float32,
                                               name='action_ph')
            ph_inputs.append(self.action_input)
            with tf.variable_scope('norm_dist', reuse=reuse):
                # stddev = exp(logvar / 2); var = exp(logvar).
                self.stddev_output = tf.exp(self.logvar_output / 2.0,
                                            name='std_dev')
                self.var_output = tf.exp(self.logvar_output, name='variance')
                self.action_distribution = tfp.distributions.MultivariateNormalDiag(
                    loc=self.mean_output,
                    scale_diag=self.stddev_output,
                    name='mlp_normal_distribution')
                self.action_output = self.action_distribution.sample()
        # Statistic name -> tensor-building callable.
        self.dist_info_tensor_op_dict = {
            # todo support more in future
            'prob': self.action_distribution.prob,
            'log_prob': self.action_distribution.log_prob,
            'entropy': self.action_distribution.entropy,
            'kl': self.kl
        }
        var_list = get_tf_collection_var_list(
            scope='{}/norm_dist'.format(name_scope))
        if self.mlp_net:
            var_list += self.mlp_net.var_list
        self.parameters = ParametersWithTensorflowVariable(
            tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name),
            rest_parameters=dict(state_input=self.state_input,
                                 action_input=self.action_input,
                                 **mlp_kwargs),
            name='normal_distribution_mlp_tf_param')
        PlaceholderInput.__init__(self,
                                  parameters=self.parameters,
                                  inputs=tuple(ph_inputs))
    def __init__(self,
                 env_spec: EnvSpec,
                 name_scope: str,
                 name: str,
                 mlp_config: list,
                 learning_rate: float,
                 state_input_scaler: DataScaler = None,
                 action_input_scaler: DataScaler = None,
                 output_delta_state_scaler: DataScaler = None,
                 init_state=None):
        """Build an MLP dynamics model that predicts the state *delta* from
        the concatenated (state, action) input.

        :param env_spec: environment specification; obs space must be a Box
        :param name_scope: TF scope for placeholders, weights and train ops
        :param name: instance name
        :param mlp_config: list of per-layer MLP config dicts; the final
               layer must output flat_obs_dim units
        :param learning_rate: optimizer learning rate (stored in parameters)
        :param state_input_scaler/action_input_scaler: optional input scalers
        :param output_delta_state_scaler: optional scaler for the predicted
               delta state; defaults to the identity scaler
        :param init_state: optional initial state forwarded to the base model
        :raises TypeError: if the observation space is not a Box
        """
        if not isinstance(env_spec.obs_space, Box):
            raise TypeError(
                'ContinuousMLPGlobalDynamicsModel only support to predict state that hold space Box type'
            )
        GlobalDynamicsModel.__init__(self,
                                     env_spec=env_spec,
                                     parameters=None,
                                     name=name,
                                     state_input_scaler=state_input_scaler,
                                     action_input_scaler=action_input_scaler,
                                     init_state=init_state)
        with tf.variable_scope(name_scope):
            state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                         dtype=tf.float32,
                                         name='state_ph')
            action_input = tf.placeholder(
                shape=[None, env_spec.flat_action_dim],
                dtype=tf.float32,
                name='action_ph')
            # Network input is the concatenated state-action vector.
            mlp_input_ph = tf.concat([state_input, action_input],
                                     axis=1,
                                     name='state_action_input')
            # Supervised target: the observed state delta (s' - s).
            delta_state_label_ph = tf.placeholder(
                shape=[None, env_spec.flat_obs_dim],
                dtype=tf.float32,
                name='delta_state_label_ph')
        mlp_net = MLP(input_ph=mlp_input_ph,
                      reuse=False,
                      mlp_config=mlp_config,
                      name_scope=name_scope,
                      net_name='mlp')
        # The network must emit exactly one output per observation dimension.
        if mlp_net.output.shape[1] != env_spec.flat_obs_dim:
            raise InappropriateParameterSetting(
                "mlp output dims {} != env spec obs dim {}".format(
                    mlp_net.output.shape[1], env_spec.flat_obs_dim))
        parameters = ParametersWithTensorflowVariable(
            tf_var_list=mlp_net.var_list,
            name=name + '_' 'mlp_continuous_dynamics_model',
            rest_parameters=dict(learning_rate=learning_rate))
        DifferentiableDynamics.__init__(
            self,
            input_node_dict=dict(state_input=state_input,
                                 action_action_input=action_input),
            output_node_dict=dict(delta_state_output=mlp_net.output))
        PlaceholderInput.__init__(self, parameters=parameters)
        self.mlp_config = mlp_config
        self.name_scope = name_scope
        self.action_input = action_input
        self.state_input = state_input
        self.mlp_input_ph = mlp_input_ph
        self.delta_state_label_ph = delta_state_label_ph
        self.delta_state_output = mlp_net.output
        self.mlp_net = mlp_net
        # Default to the identity scaler when no delta-state scaler is given.
        self.output_delta_state_scaler = output_delta_state_scaler if output_delta_state_scaler else IdenticalDataScaler(
            dims=self.env_spec.flat_obs_dim)
        self._status = StatusWithSubInfo(obj=self)
        with tf.variable_scope(name_scope):
            with tf.variable_scope('train'):
                self.loss, self.optimizer, self.optimize_op = self._setup_loss(
                )
        # Register train-scope variables plus the optimizer's slot variables,
        # deduplicated and name-sorted for deterministic save/load.
        train_var_list = get_tf_collection_var_list(
            key=tf.GraphKeys.GLOBAL_VARIABLES,
            scope='{}/train'.format(name_scope)) + self.optimizer.variables()
        self.parameters.set_tf_var_list(
            sorted(list(set(train_var_list)), key=lambda x: x.name))
    def __init__(self,
                 env_spec: EnvSpec,
                 name_scope: str,
                 name: str,
                 mlp_config: list,
                 learning_rate: float,
                 output_norm: np.ndarray = None,
                 input_norm: np.ndarray = None,
                 output_low: np.ndarray = None,
                 output_high: np.ndarray = None,
                 init_state=None):
        """Build an MLP dynamics model predicting the state delta; the next
        state is exposed as state + predicted delta.

        :param env_spec: environment specification; obs space must be a Box
        :param name_scope: TF scope for placeholders, weights and train ops
        :param name: instance name
        :param mlp_config: per-layer MLP config; final layer must emit
               flat_obs_dim units
        :param learning_rate: optimizer learning rate (stored in parameters)
        :param output_norm/input_norm: optional normalization arrays
        :param output_low/output_high: optional state bounds used to bound
               the predicted delta
        :param init_state: optional initial state forwarded to the base model
        :raises TypeError: if the observation space is not a Box
        """
        if not isinstance(env_spec.obs_space, Box):
            raise TypeError(
                'ContinuousMLPGlobalDynamicsModel only support to predict state that hold space Box type'
            )
        GlobalDynamicsModel.__init__(self,
                                     env_spec=env_spec,
                                     parameters=None,
                                     name=name,
                                     init_state=init_state)
        with tf.variable_scope(name_scope):
            state_input = tf.placeholder(shape=[None, env_spec.flat_obs_dim],
                                         dtype=tf.float32,
                                         name='state_ph')
            action_input = tf.placeholder(
                shape=[None, env_spec.flat_action_dim],
                dtype=tf.float32,
                name='action_ph')
            mlp_input_ph = tf.concat([state_input, action_input],
                                     axis=1,
                                     name='state_action_input')
            # Supervised target: the observed state delta (s' - s).
            delta_state_label_ph = tf.placeholder(
                shape=[None, env_spec.flat_obs_dim],
                dtype=tf.float32,
                name='delta_state_label_ph')
        mlp_net = MLP(
            input_ph=mlp_input_ph,
            reuse=False,
            mlp_config=mlp_config,
            input_norm=input_norm,
            output_norm=output_norm,
            # todo have a running-up mean module
            # NOTE(review): the delta bounds are high-low / low-high — the
            # widest possible per-dimension state change given Box bounds.
            # The keyword/value pairing looks swapped at first glance;
            # confirm against MLP's output-bounding semantics.
            output_high=output_high - output_low,
            output_low=output_low - output_high,
            name_scope=name_scope,
            net_name='mlp')
        # The network must emit exactly one output per observation dimension.
        assert mlp_net.output.shape[1] == env_spec.flat_obs_dim
        parameters = ParametersWithTensorflowVariable(
            tf_var_list=mlp_net.var_list,
            name=name + '_' 'mlp_continuous_dynamics_model',
            rest_parameters=dict(output_low=output_low,
                                 output_high=output_high,
                                 input_norm=input_norm,
                                 learning_rate=learning_rate))
        with tf.variable_scope(name_scope):
            with tf.variable_scope('train'):
                # Next state = current state + predicted delta.
                new_state_output = mlp_net.output + state_input
        DerivableDynamics.__init__(
            self,
            input_node_dict=dict(state_input=state_input,
                                 action_action_input=action_input),
            output_node_dict=dict(new_state_output=new_state_output))
        PlaceholderInput.__init__(self,
                                  inputs=(state_input, action_input,
                                          delta_state_label_ph),
                                  parameters=parameters)
        self.mlp_config = mlp_config
        self.name_scope = name_scope
        self.action_input = action_input
        self.state_input = state_input
        self.mlp_input_ph = mlp_input_ph
        self.delta_state_label_ph = delta_state_label_ph
        self.new_state_output = new_state_output
        self.mlp_net = mlp_net
        self._status = StatusWithSubInfo(obj=self)
        with tf.variable_scope(name_scope):
            with tf.variable_scope('train'):
                self.loss, self.optimizer, self.optimize_op = self._setup_loss(
                )
        # Register train-scope variables plus the optimizer's slot variables,
        # deduplicated and name-sorted for deterministic save/load.
        train_var_list = get_tf_collection_var_list(
            key=tf.GraphKeys.GLOBAL_VARIABLES,
            scope='{}/train'.format(name_scope)) + self.optimizer.variables()
        self.parameters.set_tf_var_list(
            sorted(list(set(train_var_list)), key=lambda x: x.name))
class DQN(ModelFreeAlgo, OffPolicyAlgo, MultiPlaceholderInput):
    """Deep Q-Network algorithm (TF-1.x graph mode).

    Maintains an online Q-network (``value_func``) and a target Q-network
    (a copy created in ``__init__``); trains the online network on sampled
    transitions with the standard one-step TD target
    ``r + (1 - done) * gamma * max_a' Q_target(s', a')`` and periodically
    soft-updates the target network toward the online one.
    """

    required_key_dict = DictConfig.load_json(file_path=GlobalConfig().DEFAULT_DQN_REQUIRED_KEY_LIST)

    @init_func_arg_record_decorator()
    @typechecked
    def __init__(self, env_spec,
                 config_or_config_dict: (DictConfig, dict),
                 value_func: MLPQValueFunction,
                 schedule_param_list=None,
                 name: str = 'dqn',
                 replay_buffer=None):
        """Construct the DQN graph.

        :param env_spec: environment specification (spaces and flattened dims).
        :param config_or_config_dict: DictConfig or plain dict with keys such as
            REPLAY_BUFFER_SIZE, GAMMA, LEARNING_RATE, DECAY, BATCH_SIZE,
            TRAIN_ITERATION; optionally UPDATE_TARGET_Q_FREQUENCY.
        :param value_func: the online Q-value network; the target network is a
            copy made from it.
        :param schedule_param_list: optional scheduled-parameter tuple forwarded
            to ParametersWithTensorflowVariable.
        :param name: name and top-level TF variable scope of this algorithm.
        :param replay_buffer: optional replay buffer instance; defaults to a
            uniform-random buffer sized from the config.
        """
        ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
        self.config = construct_dict_config(config_or_config_dict, self)
        if replay_buffer:
            # NOTE(review): issubclass on an *instance* argument is unusual —
            # preserved as-is; presumably callers pass an instance and this
            # assert is latently wrong, but fixing it would change behavior.
            assert issubclass(replay_buffer, BaseReplayBuffer)
            self.replay_buffer = replay_buffer
        else:
            self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'),
                                                           action_shape=self.env_spec.action_shape,
                                                           observation_shape=self.env_spec.obs_shape)
        self.q_value_func = value_func
        self.state_input = self.q_value_func.state_input
        self.action_input = self.q_value_func.action_input
        # How many train() calls between target-network updates (default: every call).
        self.update_target_q_every_train = self.config('UPDATE_TARGET_Q_FREQUENCY') if 'UPDATE_TARGET_Q_FREQUENCY' in \
                                                                                       self.config.config_dict else 1
        self.parameters = ParametersWithTensorflowVariable(tf_var_list=[],
                                                           rest_parameters=dict(),
                                                           to_scheduler_param_tuple=schedule_param_list,
                                                           name='{}_param'.format(name),
                                                           source_config=self.config,
                                                           require_snapshot=False)
        with tf.variable_scope(name):
            self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
            self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32)
            self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
            # max_a' Q_target(s', a'), computed in python (predict_target_with_q_val)
            # and fed back in during training.
            self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
            done = tf.cast(self.done_input, dtype=tf.float32)
            self.target_q_value_func = self.q_value_func.make_copy(name_scope='{}_target_q_value_net'.format(name),
                                                                   name='{}_target_q_value_net'.format(name),
                                                                   reuse=False)
            # One-step TD target: r + (1 - done) * gamma * max_a' Q_target(s', a').
            self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input
            self.td_error = self.predict_q_value - self.q_value_func.q_tensor
            with tf.variable_scope('train'):
                self.q_value_func_loss, self.optimizer, self.update_q_value_func_op = self._set_up_loss()
                self.update_target_q_value_func_op = self._set_up_target_update()
        # redundant sort operation on var_list
        var_list = get_tf_collection_var_list(key=tf.GraphKeys.GLOBAL_VARIABLES,
                                              scope='{}/train'.format(name)) + self.optimizer.variables()
        self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
        MultiPlaceholderInput.__init__(self,
                                       sub_placeholder_input_list=[dict(obj=self.q_value_func,
                                                                        attr_name='q_value_func'),
                                                                   dict(obj=self.target_q_value_func,
                                                                        attr_name='target_q_value_func')],
                                       parameters=self.parameters)

    @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='INITED')
    def init(self, sess=None, source_obj=None):
        """Initialize both networks and the parameter set; the target network is
        initialized as a copy of the online network. Optionally copy state from
        another DQN instance."""
        super().init()
        self.q_value_func.init()
        self.target_q_value_func.init(source_obj=self.q_value_func)
        self.parameters.init()
        if source_obj:
            self.copy_from(source_obj)

    @record_return_decorator(which_recorder='self')
    @register_counter_info_to_status_decorator(increment=1, info_key='train_counter', under_status='TRAIN')
    def train(self, batch_data=None, train_iter=None, sess=None, update_target=True) -> dict:
        """Run `train_iter` gradient steps on the online Q-network.

        :param batch_data: optional fixed TransitionData batch; if None, each
            iteration samples BATCH_SIZE transitions from the replay buffer.
        :param train_iter: number of gradient steps; defaults to TRAIN_ITERATION.
        :param sess: TF session; defaults to the default session.
        :param update_target: if True, run the target-network update op every
            `update_target_q_every_train`-th call.
        :return: dict with the mean loss over the iterations.
        :raises TypeError: if batch_data is given but is not TransitionData.
        """
        super(DQN, self).train()
        self.recorder.record()
        if batch_data and not isinstance(batch_data, TransitionData):
            raise TypeError()
        tf_sess = sess if sess else tf.get_default_session()
        train_iter = self.parameters("TRAIN_ITERATION") if not train_iter else train_iter
        average_loss = 0.0
        for i in range(train_iter):
            if batch_data is None:
                train_data = self.replay_buffer.sample(batch_size=self.parameters('BATCH_SIZE'))
            else:
                train_data = batch_data
            # Bootstrap value of the successor states from the *target* network.
            _, target_q_val_on_new_s = self.predict_target_with_q_val(obs=train_data.new_state_set,
                                                                      batch_flag=True)
            target_q_val_on_new_s = np.expand_dims(target_q_val_on_new_s, axis=1)
            assert target_q_val_on_new_s.shape[0] == train_data.state_set.shape[0]
            feed_dict = {
                self.reward_input: np.reshape(train_data.reward_set, [-1, 1]),
                self.action_input: flatten_n(self.env_spec.action_space, train_data.action_set),
                self.state_input: train_data.state_set,
                self.done_input: np.reshape(train_data.done_set, [-1, 1]),
                self.target_q_input: target_q_val_on_new_s,
                **self.parameters.return_tf_parameter_feed_dict()
            }
            res, _ = tf_sess.run([self.q_value_func_loss, self.update_q_value_func_op],
                                 feed_dict=feed_dict)
            average_loss += res
        average_loss /= train_iter
        if update_target is True and self.get_status()['train_counter'] % self.update_target_q_every_train == 0:
            tf_sess.run(self.update_target_q_value_func_op,
                        feed_dict=self.parameters.return_tf_parameter_feed_dict())
        return dict(average_loss=average_loss)

    @register_counter_info_to_status_decorator(increment=1, info_key='test_counter', under_status='TEST')
    def test(self, *arg, **kwargs):
        """Delegate to the base-class test loop (counted under TEST status)."""
        return super().test(*arg, **kwargs)

    @register_counter_info_to_status_decorator(increment=1, info_key='predict_counter')
    def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False):
        """Greedy action(s) from the *online* Q-network.

        :return: an int action for a single observation, or a list of int
            actions when batch_flag is True.
        """
        if batch_flag:
            action, q_val = self._predict_batch_action(obs=obs,
                                                       q_value_tensor=self.q_value_func.q_tensor,
                                                       action_ph=self.action_input,
                                                       state_ph=self.state_input,
                                                       sess=sess)
        else:
            action, q_val = self._predict_action(obs=obs,
                                                 q_value_tensor=self.q_value_func.q_tensor,
                                                 action_ph=self.action_input,
                                                 state_ph=self.state_input,
                                                 sess=sess)
        if not batch_flag:
            return int(action)
        else:
            # BUG FIX: `np.int` is a removed deprecated alias (NumPy >= 1.24);
            # the builtin `int` is the documented, behavior-identical replacement.
            return action.astype(int).tolist()

    def predict_target_with_q_val(self, obs: np.ndarray, sess=None, batch_flag: bool = False):
        """Greedy action(s) and max Q-value(s) from the *target* Q-network."""
        if batch_flag:
            action, q_val = self._predict_batch_action(obs=obs,
                                                       q_value_tensor=self.target_q_value_func.q_tensor,
                                                       action_ph=self.target_q_value_func.action_input,
                                                       state_ph=self.target_q_value_func.state_input,
                                                       sess=sess)
        else:
            action, q_val = self._predict_action(obs=obs,
                                                 q_value_tensor=self.target_q_value_func.q_tensor,
                                                 action_ph=self.target_q_value_func.action_input,
                                                 state_ph=self.target_q_value_func.state_input,
                                                 sess=sess)
        return action, q_val

    # Store Transition
    @register_counter_info_to_status_decorator(increment=1, info_key='append_to_memory')
    def append_to_memory(self, samples: TransitionData):
        """Append a batch of transitions to the replay buffer and update the
        running count of stored samples."""
        self.replay_buffer.append_batch(obs0=samples.state_set,
                                        obs1=samples.new_state_set,
                                        action=samples.action_set,
                                        reward=samples.reward_set,
                                        terminal1=samples.done_set)
        self._status.update_info(info_key='replay_buffer_data_total_count', increment=len(samples))

    @record_return_decorator(which_recorder='self')
    def save(self, global_step, save_path=None, name=None, **kwargs):
        """Checkpoint all sub-networks via MultiPlaceholderInput; returns the
        checkpoint location info for the recorder."""
        save_path = save_path if save_path else GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH
        name = name if name else self.name
        MultiPlaceholderInput.save(self,
                                   save_path=save_path,
                                   global_step=global_step,
                                   name=name,
                                   **kwargs)
        return dict(check_point_save_path=save_path,
                    check_point_save_global_step=global_step,
                    check_point_save_name=name)

    @record_return_decorator(which_recorder='self')
    def load(self, path_to_model, model_name, global_step=None, **kwargs):
        """Restore all sub-networks from a checkpoint; returns the load info
        for the recorder."""
        MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs)
        return dict(check_point_load_path=path_to_model,
                    check_point_load_global_step=global_step,
                    check_point_load_name=model_name)

    def _predict_action(self, obs: np.ndarray, q_value_tensor: tf.Tensor,
                        action_ph: tf.Tensor, state_ph: tf.Tensor, sess=None):
        """Evaluate Q(s, a) for every discrete action in one session run.

        The single observation is tiled once per action and paired with the
        one-hot encoding of each action, so `res` has one row per action.

        :return: (argmax index, max Q-value) — each a 1-element ndarray, since
            the Q tensor has shape [None, 1].
        :raises StateOrActionOutOfBoundError: if obs lies outside the space.
        """
        if self.env_spec.obs_space.contains(obs) is False:
            raise StateOrActionOutOfBoundError("obs {} out of bound {}".format(obs, self.env_spec.obs_space.bound()))
        obs = repeat_ndarray(obs, repeats=self.env_spec.flat_action_dim)
        tf_sess = sess if sess else tf.get_default_session()
        feed_dict = {action_ph: generate_n_actions_hot_code(n=self.env_spec.flat_action_dim),
                     state_ph: obs,
                     **self.parameters.return_tf_parameter_feed_dict()}
        res = tf_sess.run([q_value_tensor], feed_dict=feed_dict)[0]
        return np.argmax(res, axis=0), np.max(res, axis=0)

    def _predict_batch_action(self, obs: np.ndarray, q_value_tensor: tf.Tensor,
                              action_ph: tf.Tensor, state_ph: tf.Tensor, sess=None):
        """Greedy action and max Q-value for each observation in a batch.

        :return: (int ndarray of shape (len(obs),), float ndarray of shape (len(obs),)).
        """
        actions = []
        q_values = []
        for obs_i in obs:
            action, q_val = self._predict_action(obs=obs_i,
                                                 q_value_tensor=q_value_tensor,
                                                 action_ph=action_ph,
                                                 state_ph=state_ph,
                                                 sess=sess)
            # BUG FIX: `action` is already the argmax index returned by
            # _predict_action (a 1-element array). The original code applied
            # np.argmax to it *again*, and the argmax of a 1-element array is
            # always 0 — so every batched action came out as 0. Append the
            # index itself instead.
            actions.append(int(action))
            # Max over a 1-element array is the element itself (kept for shape).
            q_values.append(np.max(q_val, axis=0))
        return np.array(actions), np.array(q_values)

    def _set_up_loss(self):
        """Build the squared TD-error loss (plus any registered regularization
        losses of the online network) and its Adam minimization op, restricted
        to the online network's variables."""
        reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES,
                                     scope=self.q_value_func.name_scope)
        loss = tf.reduce_sum((self.predict_q_value - self.q_value_func.q_tensor) ** 2)
        if len(reg_loss) > 0:
            loss += tf.reduce_sum(reg_loss)
        optimizer = tf.train.AdamOptimizer(learning_rate=self.parameters('LEARNING_RATE'))
        optimize_op = optimizer.minimize(loss=loss,
                                         var_list=self.q_value_func.parameters('tf_var_list'))
        return loss, optimizer, optimize_op

    # update target net
    def _set_up_target_update(self):
        """Build the Polyak soft-update ops:
        target <- DECAY * target + (1 - DECAY) * online, one assign per variable."""
        op = []
        for var, target_var in zip(self.q_value_func.parameters('tf_var_list'),
                                   self.target_q_value_func.parameters('tf_var_list')):
            ref_val = self.parameters('DECAY') * target_var + (1.0 - self.parameters('DECAY')) * var
            op.append(tf.assign(target_var, ref_val))
        return op

    def _evaluate_td_error(self, sess=None):
        """Placeholder for TD-error evaluation; intentionally not implemented yet."""
        # tf_sess = sess if sess else tf.get_default_session()
        # feed_dict = {
        #     self.reward_input: train_data.reward_set,
        #     self.action_input: flatten_n(self.env_spec.action_space, train_data.action_set),
        #     self.state_input: train_data.state_set,
        #     self.done_input: train_data.done_set,
        #     self.target_q_input: target_q_val_on_new_s,
        #     **self.parameters.return_tf_parameter_feed_dict()
        # }
        # td_loss = tf_sess.run([self.td_error], feed_dict=feed_dict)
        pass