def __init__(self, env_spec,
             config_or_config_dict: (DictConfig, dict),
             value_func: MLPQValueFunction,
             schedule_param_list=None,
             name: str = 'dqn',
             replay_buffer=None):
    ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
    self.config = construct_dict_config(config_or_config_dict, self)
    if replay_buffer:
        assert issubclass(replay_buffer, BaseReplayBuffer)
        self.replay_buffer = replay_buffer
    else:
        self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'),
                                                       action_shape=self.env_spec.action_shape,
                                                       observation_shape=self.env_spec.obs_shape)
    self.q_value_func = value_func
    self.state_input = self.q_value_func.state_input
    self.action_input = self.q_value_func.action_input
    self.update_target_q_every_train = self.config('UPDATE_TARGET_Q_FREQUENCY') \
        if 'UPDATE_TARGET_Q_FREQUENCY' in self.config.config_dict else 1
    self.parameters = ParametersWithTensorflowVariable(tf_var_list=[],
                                                       rest_parameters=dict(),
                                                       to_scheduler_param_tuple=schedule_param_list,
                                                       name='{}_param'.format(name),
                                                       source_config=self.config,
                                                       require_snapshot=False)
    with tf.variable_scope(name):
        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        done = tf.cast(self.done_input, dtype=tf.float32)
        self.target_q_value_func = self.q_value_func.make_copy(name_scope='{}_target_q_value_net'.format(name),
                                                               name='{}_target_q_value_net'.format(name),
                                                               reuse=False)
        self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input
        self.td_error = self.predict_q_value - self.q_value_func.q_tensor
        with tf.variable_scope('train'):
            self.q_value_func_loss, self.optimizer, self.update_q_value_func_op = self._set_up_loss()
            self.update_target_q_value_func_op = self._set_up_target_update()

    # redundant sort operation on var_list
    var_list = get_tf_collection_var_list(key=tf.GraphKeys.GLOBAL_VARIABLES,
                                          scope='{}/train'.format(name)) + self.optimizer.variables()
    self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(self,
                                   sub_placeholder_input_list=[dict(obj=self.q_value_func,
                                                                    attr_name='q_value_func'),
                                                               dict(obj=self.target_q_value_func,
                                                                    attr_name='target_q_value_func')],
                                   parameters=self.parameters)
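# A minimal construction sketch, not taken from the source: it assumes the enclosing class is
# named DQN and that `env_spec` (an EnvSpec) and `mlp_q` (an MLPQValueFunction) have been built
# elsewhere. The config dict shows only the keys this __init__ reads directly
# (REPLAY_BUFFER_SIZE, GAMMA, and the optional UPDATE_TARGET_Q_FREQUENCY, which defaults to 1
# when absent); _set_up_loss() and friends will likely require further keys not visible here.
dqn = DQN(env_spec=env_spec,
          config_or_config_dict=dict(REPLAY_BUFFER_SIZE=10000,
                                     GAMMA=0.99,
                                     UPDATE_TARGET_Q_FREQUENCY=5),
          value_func=mlp_q,
          name='dqn')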
def save(self, global_step, save_path=None, name=None, **kwargs):
    save_path = save_path if save_path else GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH
    name = name if name else self.name
    MultiPlaceholderInput.save(self,
                               save_path=save_path,
                               global_step=global_step,
                               name=name,
                               **kwargs)
    return dict(check_point_save_path=save_path,
                check_point_save_global_step=global_step,
                check_point_save_name=name)
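# Usage sketch (assumption: `dqn` is an already-constructed instance of the class above).
# When save_path and name are omitted, the method falls back to
# GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH and the instance's own name, and the
# returned dict echoes back where the checkpoint was written.
ckpt_info = dqn.save(global_step=1000)
print(ckpt_info['check_point_save_path'],
      ckpt_info['check_point_save_global_step'],
      ckpt_info['check_point_save_name'])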
def __init__(self,
             env_spec: EnvSpec,
             stochastic_policy: StochasticPolicy,
             config_or_config_dict: (DictConfig, dict),
             value_func: VValueFunction,
             warm_up_trajectories_number=5,
             use_time_index_flag=False,
             name='ppo'):
    ModelFreeAlgo.__init__(self,
                           env_spec=env_spec,
                           name=name,
                           warm_up_trajectories_number=warm_up_trajectories_number)
    self.use_time_index_flag = use_time_index_flag
    self.config = construct_dict_config(config_or_config_dict, self)
    self.policy = stochastic_policy
    self.value_func = value_func
    to_ph_parameter_dict = dict()
    self.trajectory_memory = TrajectoryData(env_spec=env_spec)
    self.transition_data_for_trajectory = TransitionData(env_spec=env_spec)
    self.value_func_train_data_buffer = None
    self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim)
    if use_time_index_flag:
        scale_last_time_index_mean = self.scaler._mean
        scale_last_time_index_mean[-1] = 0
        scale_last_time_index_var = self.scaler._var
        scale_last_time_index_var[-1] = 1000 * 1000
        self.scaler.set_param(mean=scale_last_time_index_mean,
                              var=scale_last_time_index_var)
    with tf.variable_scope(name):
        self.advantages_ph = tf.placeholder(tf.float32, (None,), 'advantages')
        self.v_func_val_ph = tf.placeholder(tf.float32, (None,), 'val_val_func')
        dist_info_list = self.policy.get_dist_info()
        self.old_dist_tensor = [
            (tf.placeholder(**dict(dtype=dist_info['dtype'],
                                   shape=dist_info['shape'],
                                   name=dist_info['name'])), dist_info['name'])
            for dist_info in dist_info_list
        ]
        self.old_policy = self.policy.make_copy(
            reuse=False,
            name_scope='old_{}'.format(self.policy.name),
            name='old_{}'.format(self.policy.name),
            distribution_tensors_tuple=tuple(self.old_dist_tensor))
        to_ph_parameter_dict['beta'] = tf.placeholder(tf.float32, (), 'beta')
        to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta')
        to_ph_parameter_dict['kl_target'] = tf.placeholder(tf.float32, (), 'kl_target')
        to_ph_parameter_dict['lr_multiplier'] = tf.placeholder(tf.float32, (), 'lr_multiplier')

    self.parameters = ParametersWithTensorflowVariable(
        tf_var_list=[],
        rest_parameters=dict(
            advantages_ph=self.advantages_ph,
            v_func_val_ph=self.v_func_val_ph,
        ),
        to_ph_parameter_dict=to_ph_parameter_dict,
        name='ppo_param',
        save_rest_param_flag=False,
        source_config=self.config,
        require_snapshot=False)

    with tf.variable_scope(name):
        with tf.variable_scope('train'):
            self.kl = tf.reduce_mean(self.old_policy.kl(self.policy))
            self.average_entropy = tf.reduce_mean(self.policy.entropy())
            self.policy_loss, self.policy_optimizer, self.policy_update_op = self._setup_policy_loss()
            self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = self._setup_value_func_loss()

    var_list = get_tf_collection_var_list('{}/train'.format(name)) \
               + self.policy_optimizer.variables() \
               + self.value_func_optimizer.variables()
    self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(self,
                                   sub_placeholder_input_list=[
                                       dict(obj=self.value_func, attr_name='value_func'),
                                       dict(obj=self.policy, attr_name='policy')
                                   ],
                                   parameters=self.parameters)
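# A construction sketch under stated assumptions: the enclosing class is referred to here as PPO,
# and `env_spec`, `my_policy` (a StochasticPolicy) and `my_value_func` (a VValueFunction) are built
# elsewhere. No config keys are read directly in this __init__, so the exact contents of
# `ppo_config_dict` depend on _setup_policy_loss/_setup_value_func_loss and are left as a placeholder.
ppo = PPO(env_spec=env_spec,
          stochastic_policy=my_policy,
          value_func=my_value_func,
          config_or_config_dict=ppo_config_dict,
          warm_up_trajectories_number=5,
          use_time_index_flag=False,
          name='ppo')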
def load(self, path_to_model, model_name, global_step=None, **kwargs):
    MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs)
    return dict(check_point_load_path=path_to_model,
                check_point_load_global_step=global_step,
                check_point_load_name=model_name)
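# Usage sketch (assumptions: `ppo` is an already-constructed instance and the checkpoint
# directory below exists; the path and model name are placeholders). global_step is passed
# straight through to MultiPlaceholderInput.load, and the returned dict simply echoes the
# load arguments for bookkeeping.
load_info = ppo.load(path_to_model='/path/to/checkpoints', model_name='ppo', global_step=None)
assert load_info['check_point_load_name'] == 'ppo'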
def __init__(self,
             env_spec: EnvSpec,
             config_or_config_dict: (DictConfig, dict),
             value_func: MLPQValueFunction,
             policy: DeterministicMLPPolicy,
             schedule_param_list=None,
             name='ddpg',
             replay_buffer=None):
    """
    :param env_spec: environment specification, e.g. action space and observation space
    :param config_or_config_dict: configuration dictionary, e.g. learning rate or decay, if any
    :param value_func: value function
    :param policy: agent policy
    :param schedule_param_list: schedule parameter list, if any, used to schedule the learning process
    :param name: name of the algorithm class instance
    :param replay_buffer: replay buffer, if any
    """
    ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name)
    config = construct_dict_config(config_or_config_dict, self)
    self.config = config
    self.actor = policy
    self.target_actor = self.actor.make_copy(name_scope='{}_target_actor'.format(self.name),
                                             name='{}_target_actor'.format(self.name),
                                             reuse=False)
    self.critic = value_func
    self.target_critic = self.critic.make_copy(name_scope='{}_target_critic'.format(self.name),
                                               name='{}_target_critic'.format(self.name),
                                               reuse=False)
    self.state_input = self.actor.state_input

    if replay_buffer:
        assert issubclass(replay_buffer, BaseReplayBuffer)
        self.replay_buffer = replay_buffer
    else:
        self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'),
                                                       action_shape=self.env_spec.action_shape,
                                                       observation_shape=self.env_spec.obs_shape)

    # self.parameters contains all the parameters (variables) of the algorithm
    self.parameters = ParametersWithTensorflowVariable(tf_var_list=[],
                                                       rest_parameters=dict(),
                                                       to_scheduler_param_tuple=schedule_param_list,
                                                       name='ddpg_param',
                                                       source_config=config,
                                                       require_snapshot=False)
    self._critic_with_actor_output = self.critic.make_copy(reuse=True,
                                                           name='actor_input_{}'.format(self.critic.name),
                                                           state_input=self.state_input,
                                                           action_input=self.actor.action_tensor)
    self._target_critic_with_target_actor_output = self.target_critic.make_copy(
        reuse=True,
        name='target_critic_with_target_actor_output_{}'.format(self.critic.name),
        action_input=self.target_actor.action_tensor)

    with tf.variable_scope(name):
        self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32)
        self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool)
        self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32)
        done = tf.cast(self.done_input, dtype=tf.float32)
        self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input
        with tf.variable_scope('train'):
            self.critic_loss, self.critic_update_op, self.target_critic_update_op, self.critic_optimizer, \
                self.critic_grads = self._setup_critic_loss()
            self.actor_loss, self.actor_update_op, self.target_actor_update_op, self.action_optimizer, \
                self.actor_grads = self._set_up_actor_loss()

    var_list = get_tf_collection_var_list('{}/train'.format(name)) \
               + self.critic_optimizer.variables() + self.action_optimizer.variables()
    self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name))
    MultiPlaceholderInput.__init__(self,
                                   sub_placeholder_input_list=[dict(obj=self.target_actor,
                                                                    attr_name='target_actor'),
                                                               dict(obj=self.actor,
                                                                    attr_name='actor'),
                                                               dict(obj=self.critic,
                                                                    attr_name='critic'),
                                                               dict(obj=self.target_critic,
                                                                    attr_name='target_critic')],
                                   parameters=self.parameters)
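# A construction sketch, assuming the enclosing class is named DDPG and that `env_spec`,
# `mlp_q` (an MLPQValueFunction) and `det_policy` (a DeterministicMLPPolicy) are created
# elsewhere. Only REPLAY_BUFFER_SIZE and GAMMA are read directly in this __init__; the
# critic/actor loss setup will likely need additional keys (e.g. learning rates) that are
# not visible in this snippet.
ddpg = DDPG(env_spec=env_spec,
            config_or_config_dict=dict(REPLAY_BUFFER_SIZE=100000,
                                       GAMMA=0.99),
            value_func=mlp_q,
            policy=det_policy,
            name='ddpg')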