class MLP(object): @typechecked def __init__( self, input_ph: tf.Tensor, name_scope: str, net_name: str, reuse, mlp_config: list, input_norm: np.ndarray = None, output_norm: np.ndarray = None, output_low: np.ndarray = None, output_high: np.ndarray = None, ): self.input_ph = input_ph self.name_scope = name_scope self.mlp_config = mlp_config self.mlp_net_name = net_name self.net, self.output, self.var_list = MLPCreator.create_network_with_tf_layers( input=input_ph, reuse=reuse, network_config=mlp_config, tf_var_scope=name_scope, net_name=net_name, input_norm=input_norm, output_high=output_high, output_low=output_low, output_norm=output_norm) for var in self.var_list: assert name_scope in var.name self._parameters = ParametersWithTensorflowVariable( tf_var_list=self.var_list, name='parameters_{}'.format(self.mlp_net_name), rest_parameters=dict()) def forward( self, input: np.ndarray, sess=tf.get_default_session()) -> np.ndarray: feed_dict = { self.input_ph: input, **self._parameters.return_tf_parameter_feed_dict() } res = sess.run(self.output, feed_dict=feed_dict) return np.squeeze(res) def copy_from(self, obj) -> bool: if not isinstance(obj, type(self)): raise TypeError( 'Wrong type of obj %s to be copied, which should be %s' % (type(obj), type(self))) self._parameters.copy_from(source_parameter=obj._parameters) return True def init(self, source_obj=None): self._parameters.init() if source_obj: self.copy_from(obj=source_obj)
def create_ph(self, name): with tf.variable_scope(name): a = tf.get_variable(shape=[3, 4], dtype=tf.float32, name='var_1') conf = DictConfig(required_key_dict=Foo.required_key_dict, config_dict=dict(var1=1, var2=0.01)) param = ParametersWithTensorflowVariable( tf_var_list=[a], rest_parameters=dict(var3='sss'), name=name, source_config=conf, require_snapshot=True, to_ph_parameter_dict=dict( var1=tf.placeholder(shape=(), dtype=tf.int32))) param.init() a = PlaceholderInput(parameters=param) return a, locals()
class PPO(ModelFreeAlgo, OnPolicyAlgo, MultiPlaceholderInput): required_key_dict = DictConfig.load_json( file_path=GlobalConfig().DEFAULT_PPO_REQUIRED_KEY_LIST) @typechecked def __init__(self, env_spec: EnvSpec, stochastic_policy: StochasticPolicy, config_or_config_dict: (DictConfig, dict), value_func: VValueFunction, warm_up_trajectories_number=5, use_time_index_flag=False, name='ppo'): ModelFreeAlgo.__init__( self, env_spec=env_spec, name=name, warm_up_trajectories_number=warm_up_trajectories_number) self.use_time_index_flag = use_time_index_flag self.config = construct_dict_config(config_or_config_dict, self) self.policy = stochastic_policy self.value_func = value_func to_ph_parameter_dict = dict() self.trajectory_memory = TrajectoryData(env_spec=env_spec) self.transition_data_for_trajectory = TransitionData(env_spec=env_spec) self.value_func_train_data_buffer = None self.scaler = RunningStandardScaler(dims=self.env_spec.flat_obs_dim) if use_time_index_flag: scale_last_time_index_mean = self.scaler._mean scale_last_time_index_mean[-1] = 0 scale_last_time_index_var = self.scaler._var scale_last_time_index_var[-1] = 1000 * 1000 self.scaler.set_param(mean=scale_last_time_index_mean, var=scale_last_time_index_var) with tf.variable_scope(name): self.advantages_ph = tf.placeholder(tf.float32, (None, ), 'advantages') self.v_func_val_ph = tf.placeholder(tf.float32, (None, ), 'val_val_func') dist_info_list = self.policy.get_dist_info() self.old_dist_tensor = [ (tf.placeholder(**dict(dtype=dist_info['dtype'], shape=dist_info['shape'], name=dist_info['name'])), dist_info['name']) for dist_info in dist_info_list ] self.old_policy = self.policy.make_copy( reuse=False, name_scope='old_{}'.format(self.policy.name), name='old_{}'.format(self.policy.name), distribution_tensors_tuple=tuple(self.old_dist_tensor)) to_ph_parameter_dict['beta'] = tf.placeholder( tf.float32, (), 'beta') to_ph_parameter_dict['eta'] = tf.placeholder(tf.float32, (), 'eta') to_ph_parameter_dict['kl_target'] = tf.placeholder( tf.float32, (), 'kl_target') to_ph_parameter_dict['lr_multiplier'] = tf.placeholder( tf.float32, (), 'lr_multiplier') self.parameters = ParametersWithTensorflowVariable( tf_var_list=[], rest_parameters=dict( advantages_ph=self.advantages_ph, v_func_val_ph=self.v_func_val_ph, ), to_ph_parameter_dict=to_ph_parameter_dict, name='ppo_param', save_rest_param_flag=False, source_config=self.config, require_snapshot=False) with tf.variable_scope(name): with tf.variable_scope('train'): self.kl = tf.reduce_mean(self.old_policy.kl(self.policy)) self.average_entropy = tf.reduce_mean(self.policy.entropy()) self.policy_loss, self.policy_optimizer, self.policy_update_op = self._setup_policy_loss( ) self.value_func_loss, self.value_func_optimizer, self.value_func_update_op = self._setup_value_func_loss( ) var_list = get_tf_collection_var_list( '{}/train'.format(name)) + self.policy_optimizer.variables( ) + self.value_func_optimizer.variables() self.parameters.set_tf_var_list( tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name)) MultiPlaceholderInput.__init__(self, sub_placeholder_input_list=[ dict( obj=self.value_func, attr_name='value_func', ), dict(obj=self.policy, attr_name='policy') ], parameters=self.parameters) def warm_up(self, trajectory_data: TrajectoryData): for traj in trajectory_data.trajectories: self.scaler.update_scaler(data=traj.state_set) if self.use_time_index_flag: scale_last_time_index_mean = self.scaler._mean scale_last_time_index_mean[-1] = 0 scale_last_time_index_var = self.scaler._var scale_last_time_index_var[-1] = 1000 * 1000 self.scaler.set_param(mean=scale_last_time_index_mean, var=scale_last_time_index_var) @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='INITED') def init(self, sess=None, source_obj=None): self.policy.init() self.value_func.init() self.parameters.init() if source_obj: self.copy_from(source_obj) super().init() @record_return_decorator(which_recorder='self') @register_counter_info_to_status_decorator(increment=1, info_key='train', under_status='TRAIN') def train(self, trajectory_data: TrajectoryData = None, train_iter=None, sess=None) -> dict: super(PPO, self).train() if trajectory_data is None: trajectory_data = self.trajectory_memory if len(trajectory_data) == 0: raise MemoryBufferLessThanBatchSizeError( 'not enough trajectory data') for i, traj in enumerate(trajectory_data.trajectories): trajectory_data.trajectories[i].append_new_set( name='state_set', shape=self.env_spec.obs_shape, data_set=np.reshape( np.array(self.scaler.process(np.array(traj.state_set))), [-1] + list(self.env_spec.obs_shape))) trajectory_data.trajectories[i].append_new_set( name='new_state_set', shape=self.env_spec.obs_shape, data_set=np.reshape( np.array(self.scaler.process(np.array( traj.new_state_set))), [-1] + list(self.env_spec.obs_shape))) tf_sess = sess if sess else tf.get_default_session() SampleProcessor.add_estimated_v_value(trajectory_data, value_func=self.value_func) SampleProcessor.add_discount_sum_reward(trajectory_data, gamma=self.parameters('gamma')) SampleProcessor.add_gae(trajectory_data, gamma=self.parameters('gamma'), name='advantage_set', lam=self.parameters('lam'), value_func=self.value_func) trajectory_data = SampleProcessor.normalization(trajectory_data, key='advantage_set') policy_res_dict = self._update_policy( state_set=np.concatenate( [t('state_set') for t in trajectory_data.trajectories], axis=0), action_set=np.concatenate( [t('action_set') for t in trajectory_data.trajectories], axis=0), advantage_set=np.concatenate( [t('advantage_set') for t in trajectory_data.trajectories], axis=0), train_iter=train_iter if train_iter else self.parameters('policy_train_iter'), sess=tf_sess) value_func_res_dict = self._update_value_func( state_set=np.concatenate( [t('state_set') for t in trajectory_data.trajectories], axis=0), discount_set=np.concatenate( [t('discount_set') for t in trajectory_data.trajectories], axis=0), train_iter=train_iter if train_iter else self.parameters('value_func_train_iter'), sess=tf_sess) trajectory_data.reset() self.trajectory_memory.reset() return {**policy_res_dict, **value_func_res_dict} @register_counter_info_to_status_decorator(increment=1, info_key='test', under_status='TEST') def test(self, *arg, **kwargs) -> dict: return super().test(*arg, **kwargs) @register_counter_info_to_status_decorator(increment=1, info_key='predict') def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False): tf_sess = sess if sess else tf.get_default_session() ac = self.policy.forward( obs=self.scaler.process( data=make_batch(obs, original_shape=self.env_spec.obs_shape)), sess=tf_sess, feed_dict=self.parameters.return_tf_parameter_feed_dict()) return ac def append_to_memory(self, samples: TrajectoryData): # todo how to make sure the data's time sequential obs_list = samples.trajectories[0].state_set for i in range(1, len(samples.trajectories)): obs_list = np.array( np.concatenate([obs_list, samples.trajectories[i].state_set], axis=0)) self.trajectory_memory.union(samples) self.scaler.update_scaler(data=np.array(obs_list)) if self.use_time_index_flag: scale_last_time_index_mean = self.scaler._mean scale_last_time_index_mean[-1] = 0 scale_last_time_index_var = self.scaler._var scale_last_time_index_var[-1] = 1000 * 1000 self.scaler.set_param(mean=scale_last_time_index_mean, var=scale_last_time_index_var) @record_return_decorator(which_recorder='self') def save(self, global_step, save_path=None, name=None, **kwargs): save_path = save_path if save_path else GlobalConfig( ).DEFAULT_MODEL_CHECKPOINT_PATH name = name if name else self.name MultiPlaceholderInput.save(self, save_path=save_path, global_step=global_step, name=name, **kwargs) return dict(check_point_save_path=save_path, check_point_save_global_step=global_step, check_point_save_name=name) @record_return_decorator(which_recorder='self') def load(self, path_to_model, model_name, global_step=None, **kwargs): MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs) return dict(check_point_load_path=path_to_model, check_point_load_global_step=global_step, check_point_load_name=model_name) def _setup_policy_loss(self): """ Code clip from pat-cody Three loss terms: 1) standard policy gradient 2) D_KL(pi_old || pi_new) 3) Hinge loss on [D_KL - kl_targ]^2 See: https://arxiv.org/pdf/1707.02286.pdf """ if self.parameters('clipping_range') is not None: pg_ratio = tf.exp(self.policy.log_prob() - self.old_policy.log_prob()) clipped_pg_ratio = tf.clip_by_value( pg_ratio, 1 - self.parameters('clipping_range')[0], 1 + self.parameters('clipping_range')[1]) surrogate_loss = tf.minimum(self.advantages_ph * pg_ratio, self.advantages_ph * clipped_pg_ratio) loss = -tf.reduce_mean(surrogate_loss) else: loss1 = -tf.reduce_mean( self.advantages_ph * tf.exp(self.policy.log_prob() - self.old_policy.log_prob())) loss2 = tf.reduce_mean(self.parameters('beta') * self.kl) loss3 = self.parameters('eta') * tf.square( tf.maximum(0.0, self.kl - 2.0 * self.parameters('kl_target'))) loss = loss1 + loss2 + loss3 self.loss1 = loss1 self.loss2 = loss2 self.loss3 = loss3 if isinstance(self.policy, PlaceholderInput): reg_list = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.policy.name_scope) if len(reg_list) > 0: reg_loss = tf.reduce_sum(reg_list) loss += reg_loss optimizer = tf.train.AdamOptimizer( learning_rate=self.parameters('policy_lr') * self.parameters('lr_multiplier')) train_op = optimizer.minimize( loss, var_list=self.policy.parameters('tf_var_list')) return loss, optimizer, train_op def _setup_value_func_loss(self): # todo update the value_func design loss = tf.reduce_mean( tf.square( tf.squeeze(self.value_func.v_tensor) - self.v_func_val_ph)) reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.value_func.name_scope) if len(reg_loss) > 0: loss += tf.reduce_sum(reg_loss) optimizer = tf.train.AdamOptimizer(self.parameters('value_func_lr')) train_op = optimizer.minimize( loss, var_list=self.value_func.parameters('tf_var_list')) return loss, optimizer, train_op def _update_policy(self, state_set, action_set, advantage_set, train_iter, sess): old_policy_feed_dict = dict() res = sess.run( [ getattr(self.policy, tensor[1]) for tensor in self.old_dist_tensor ], feed_dict={ self.policy.parameters('state_input'): state_set, self.policy.parameters('action_input'): action_set, **self.parameters.return_tf_parameter_feed_dict() }) for tensor, val in zip(self.old_dist_tensor, res): old_policy_feed_dict[tensor[0]] = val feed_dict = { self.policy.parameters('action_input'): action_set, self.old_policy.parameters('action_input'): action_set, self.policy.parameters('state_input'): state_set, self.advantages_ph: advantage_set, **self.parameters.return_tf_parameter_feed_dict(), **old_policy_feed_dict } average_loss, average_kl, average_entropy = 0.0, 0.0, 0.0 total_epoch = 0 kl = None for i in range(train_iter): _ = sess.run(self.policy_update_op, feed_dict=feed_dict) loss, kl, entropy = sess.run( [self.policy_loss, self.kl, self.average_entropy], feed_dict=feed_dict) average_loss += loss average_kl += kl average_entropy += entropy total_epoch = i + 1 if kl > self.parameters('kl_target', require_true_value=True) * 4: # early stopping if D_KL diverges badly break average_loss, average_kl, average_entropy = average_loss / total_epoch, average_kl / total_epoch, average_entropy / total_epoch if kl > self.parameters('kl_target', require_true_value=True ) * 2: # servo beta to reach D_KL target self.parameters.set( key='beta', new_val=np.minimum( 35, 1.5 * self.parameters('beta', require_true_value=True))) if self.parameters( 'beta', require_true_value=True) > 30 and self.parameters( 'lr_multiplier', require_true_value=True) > 0.1: self.parameters.set( key='lr_multiplier', new_val=self.parameters('lr_multiplier', require_true_value=True) / 1.5) elif kl < self.parameters('kl_target', require_true_value=True) / 2: self.parameters.set( key='beta', new_val=np.maximum( 1 / 35, self.parameters('beta', require_true_value=True) / 1.5)) if self.parameters('beta', require_true_value=True) < ( 1 / 30) and self.parameters('lr_multiplier', require_true_value=True) < 10: self.parameters.set( key='lr_multiplier', new_val=self.parameters('lr_multiplier', require_true_value=True) * 1.5) return dict(policy_average_loss=average_loss, policy_average_kl=average_kl, policy_average_entropy=average_entropy, policy_total_train_epoch=total_epoch) def _update_value_func(self, state_set, discount_set, train_iter, sess): y_hat = self.value_func.forward(obs=state_set).squeeze() old_exp_var = 1 - np.var(discount_set - y_hat) / np.var(discount_set) if self.value_func_train_data_buffer is None: self.value_func_train_data_buffer = (state_set, discount_set) else: self.value_func_train_data_buffer = ( np.concatenate( [self.value_func_train_data_buffer[0], state_set], axis=0), np.concatenate( [self.value_func_train_data_buffer[1], discount_set], axis=0)) if len(self.value_func_train_data_buffer[0]) > self.parameters( 'value_func_memory_size'): self.value_func_train_data_buffer = tuple( np.array(data[-self.parameters('value_func_memory_size'):]) for data in self.value_func_train_data_buffer) state_set_all, discount_set_all = self.value_func_train_data_buffer param_dict = self.parameters.return_tf_parameter_feed_dict() for i in range(train_iter): random_index = np.random.choice(np.arange(len(state_set_all)), len(state_set_all)) state_set_all = state_set_all[random_index] discount_set_all = discount_set_all[random_index] for index in range( 0, len(state_set_all) - self.parameters('value_func_train_batch_size'), self.parameters('value_func_train_batch_size')): state = np.array( state_set_all[index:index + self. parameters('value_func_train_batch_size')]) discount = discount_set_all[ index:index + self.parameters('value_func_train_batch_size')] loss, _ = sess.run( [self.value_func_loss, self.value_func_update_op], options=tf.RunOptions( report_tensor_allocations_upon_oom=True), feed_dict={ self.value_func.state_input: state, self.v_func_val_ph: discount, **param_dict }) y_hat = self.value_func.forward(obs=state_set).squeeze() loss = np.mean(np.square(y_hat - discount_set)) exp_var = 1 - np.var(discount_set - y_hat) / np.var(discount_set) return dict(value_func_loss=loss, value_func_policy_exp_var=exp_var, value_func_policy_old_exp_var=old_exp_var)
class DQN(ModelFreeAlgo, OffPolicyAlgo, MultiPlaceholderInput): required_key_dict = DictConfig.load_json(file_path=GlobalConfig().DEFAULT_DQN_REQUIRED_KEY_LIST) @init_func_arg_record_decorator() @typechecked def __init__(self, env_spec, config_or_config_dict: (DictConfig, dict), value_func: MLPQValueFunction, schedule_param_list=None, name: str = 'dqn', replay_buffer=None): ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name) self.config = construct_dict_config(config_or_config_dict, self) if replay_buffer: assert issubclass(replay_buffer, BaseReplayBuffer) self.replay_buffer = replay_buffer else: self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'), action_shape=self.env_spec.action_shape, observation_shape=self.env_spec.obs_shape) self.q_value_func = value_func self.state_input = self.q_value_func.state_input self.action_input = self.q_value_func.action_input self.update_target_q_every_train = self.config('UPDATE_TARGET_Q_FREQUENCY') if 'UPDATE_TARGET_Q_FREQUENCY' in \ self.config.config_dict else 1 self.parameters = ParametersWithTensorflowVariable(tf_var_list=[], rest_parameters=dict(), to_scheduler_param_tuple=schedule_param_list, name='{}_param'.format(name), source_config=self.config, require_snapshot=False) with tf.variable_scope(name): self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32) self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32) self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool) self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32) done = tf.cast(self.done_input, dtype=tf.float32) self.target_q_value_func = self.q_value_func.make_copy(name_scope='{}_target_q_value_net'.format(name), name='{}_target_q_value_net'.format(name), reuse=False) self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input self.td_error = self.predict_q_value - self.q_value_func.q_tensor with tf.variable_scope('train'): self.q_value_func_loss, self.optimizer, self.update_q_value_func_op = self._set_up_loss() self.update_target_q_value_func_op = self._set_up_target_update() # redundant sort operation on var_list var_list = get_tf_collection_var_list(key=tf.GraphKeys.GLOBAL_VARIABLES, scope='{}/train'.format(name)) + self.optimizer.variables() self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name)) MultiPlaceholderInput.__init__(self, sub_placeholder_input_list=[dict(obj=self.q_value_func, attr_name='q_value_func'), dict(obj=self.target_q_value_func, attr_name='target_q_value_func')], parameters=self.parameters) @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='INITED') def init(self, sess=None, source_obj=None): super().init() self.q_value_func.init() self.target_q_value_func.init(source_obj=self.q_value_func) self.parameters.init() if source_obj: self.copy_from(source_obj) @record_return_decorator(which_recorder='self') @register_counter_info_to_status_decorator(increment=1, info_key='train_counter', under_status='TRAIN') def train(self, batch_data=None, train_iter=None, sess=None, update_target=True) -> dict: super(DQN, self).train() self.recorder.record() if batch_data and not isinstance(batch_data, TransitionData): raise TypeError() tf_sess = sess if sess else tf.get_default_session() train_iter = self.parameters("TRAIN_ITERATION") if not train_iter else train_iter average_loss = 0.0 for i in range(train_iter): if batch_data is None: train_data = self.replay_buffer.sample(batch_size=self.parameters('BATCH_SIZE')) else: train_data = batch_data _, target_q_val_on_new_s = self.predict_target_with_q_val(obs=train_data.new_state_set, batch_flag=True) target_q_val_on_new_s = np.expand_dims(target_q_val_on_new_s, axis=1) assert target_q_val_on_new_s.shape[0] == train_data.state_set.shape[0] feed_dict = { self.reward_input: np.reshape(train_data.reward_set, [-1, 1]), self.action_input: flatten_n(self.env_spec.action_space, train_data.action_set), self.state_input: train_data.state_set, self.done_input: np.reshape(train_data.done_set, [-1, 1]), self.target_q_input: target_q_val_on_new_s, **self.parameters.return_tf_parameter_feed_dict() } res, _ = tf_sess.run([self.q_value_func_loss, self.update_q_value_func_op], feed_dict=feed_dict) average_loss += res average_loss /= train_iter if update_target is True and self.get_status()['train_counter'] % self.update_target_q_every_train == 0: tf_sess.run(self.update_target_q_value_func_op, feed_dict=self.parameters.return_tf_parameter_feed_dict()) return dict(average_loss=average_loss) @register_counter_info_to_status_decorator(increment=1, info_key='test_counter', under_status='TEST') def test(self, *arg, **kwargs): return super().test(*arg, **kwargs) @register_counter_info_to_status_decorator(increment=1, info_key='predict_counter') def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False): if batch_flag: action, q_val = self._predict_batch_action(obs=obs, q_value_tensor=self.q_value_func.q_tensor, action_ph=self.action_input, state_ph=self.state_input, sess=sess) else: action, q_val = self._predict_action(obs=obs, q_value_tensor=self.q_value_func.q_tensor, action_ph=self.action_input, state_ph=self.state_input, sess=sess) if not batch_flag: return int(action) else: return action.astype(np.int).tolist() def predict_target_with_q_val(self, obs: np.ndarray, sess=None, batch_flag: bool = False): if batch_flag: action, q_val = self._predict_batch_action(obs=obs, q_value_tensor=self.target_q_value_func.q_tensor, action_ph=self.target_q_value_func.action_input, state_ph=self.target_q_value_func.state_input, sess=sess) else: action, q_val = self._predict_action(obs=obs, q_value_tensor=self.target_q_value_func.q_tensor, action_ph=self.target_q_value_func.action_input, state_ph=self.target_q_value_func.state_input, sess=sess) return action, q_val # Store Transition @register_counter_info_to_status_decorator(increment=1, info_key='append_to_memory') def append_to_memory(self, samples: TransitionData): self.replay_buffer.append_batch(obs0=samples.state_set, obs1=samples.new_state_set, action=samples.action_set, reward=samples.reward_set, terminal1=samples.done_set) self._status.update_info(info_key='replay_buffer_data_total_count', increment=len(samples)) @record_return_decorator(which_recorder='self') def save(self, global_step, save_path=None, name=None, **kwargs): save_path = save_path if save_path else GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH name = name if name else self.name MultiPlaceholderInput.save(self, save_path=save_path, global_step=global_step, name=name, **kwargs) return dict(check_point_save_path=save_path, check_point_save_global_step=global_step, check_point_save_name=name) @record_return_decorator(which_recorder='self') def load(self, path_to_model, model_name, global_step=None, **kwargs): MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs) return dict(check_point_load_path=path_to_model, check_point_load_global_step=global_step, check_point_load_name=model_name) def _predict_action(self, obs: np.ndarray, q_value_tensor: tf.Tensor, action_ph: tf.Tensor, state_ph: tf.Tensor, sess=None): if self.env_spec.obs_space.contains(obs) is False: raise StateOrActionOutOfBoundError("obs {} out of bound {}".format(obs, self.env_spec.obs_space.bound())) obs = repeat_ndarray(obs, repeats=self.env_spec.flat_action_dim) tf_sess = sess if sess else tf.get_default_session() feed_dict = {action_ph: generate_n_actions_hot_code(n=self.env_spec.flat_action_dim), state_ph: obs, **self.parameters.return_tf_parameter_feed_dict()} res = tf_sess.run([q_value_tensor], feed_dict=feed_dict)[0] return np.argmax(res, axis=0), np.max(res, axis=0) def _predict_batch_action(self, obs: np.ndarray, q_value_tensor: tf.Tensor, action_ph: tf.Tensor, state_ph: tf.Tensor, sess=None): actions = [] q_values = [] for obs_i in obs: action, q_val = self._predict_action(obs=obs_i, q_value_tensor=q_value_tensor, action_ph=action_ph, state_ph=state_ph, sess=sess) actions.append(np.argmax(action, axis=0)) q_values.append(np.max(q_val, axis=0)) return np.array(actions), np.array(q_values) def _set_up_loss(self): reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.q_value_func.name_scope) loss = tf.reduce_sum((self.predict_q_value - self.q_value_func.q_tensor) ** 2) if len(reg_loss) > 0: loss += tf.reduce_sum(reg_loss) optimizer = tf.train.AdamOptimizer(learning_rate=self.parameters('LEARNING_RATE')) optimize_op = optimizer.minimize(loss=loss, var_list=self.q_value_func.parameters('tf_var_list')) return loss, optimizer, optimize_op # update target net def _set_up_target_update(self): op = [] for var, target_var in zip(self.q_value_func.parameters('tf_var_list'), self.target_q_value_func.parameters('tf_var_list')): ref_val = self.parameters('DECAY') * target_var + (1.0 - self.parameters('DECAY')) * var op.append(tf.assign(target_var, ref_val)) return op def _evaluate_td_error(self, sess=None): # tf_sess = sess if sess else tf.get_default_session() # feed_dict = { # self.reward_input: train_data.reward_set, # self.action_input: flatten_n(self.env_spec.action_space, train_data.action_set), # self.state_input: train_data.state_set, # self.done_input: train_data.done_set, # self.target_q_input: target_q_val_on_new_s, # **self.parameters.return_tf_parameter_feed_dict() # } # td_loss = tf_sess.run([self.td_error], feed_dict=feed_dict) pass
class DDPG(ModelFreeAlgo, OffPolicyAlgo, MultiPlaceholderInput): required_key_dict = DictConfig.load_json(file_path=GlobalConfig().DEFAULT_DDPG_REQUIRED_KEY_LIST) @typechecked def __init__(self, env_spec: EnvSpec, config_or_config_dict: (DictConfig, dict), value_func: MLPQValueFunction, policy: DeterministicMLPPolicy, schedule_param_list=None, name='ddpg', replay_buffer=None): """ :param env_spec: environment specifications, like action apace or observation space :param config_or_config_dict: configuraion dictionary, like learning rate or decay, if any :param value_func: value function :param policy: agent policy :param schedule_param_list: schedule parameter list, if any initla final function to schedule learning process :param name: name of algorithm class instance :param replay_buffer: replay buffer, if any """ ModelFreeAlgo.__init__(self, env_spec=env_spec, name=name) config = construct_dict_config(config_or_config_dict, self) self.config = config self.actor = policy self.target_actor = self.actor.make_copy(name_scope='{}_target_actor'.format(self.name), name='{}_target_actor'.format(self.name), reuse=False) self.critic = value_func self.target_critic = self.critic.make_copy(name_scope='{}_target_critic'.format(self.name), name='{}_target_critic'.format(self.name), reuse=False) self.state_input = self.actor.state_input if replay_buffer: assert issubclass(replay_buffer, BaseReplayBuffer) self.replay_buffer = replay_buffer else: self.replay_buffer = UniformRandomReplayBuffer(limit=self.config('REPLAY_BUFFER_SIZE'), action_shape=self.env_spec.action_shape, observation_shape=self.env_spec.obs_shape) """ self.parameters contains all the parameters (variables) of the algorithm """ self.parameters = ParametersWithTensorflowVariable(tf_var_list=[], rest_parameters=dict(), to_scheduler_param_tuple=schedule_param_list, name='ddpg_param', source_config=config, require_snapshot=False) self._critic_with_actor_output = self.critic.make_copy(reuse=True, name='actor_input_{}'.format(self.critic.name), state_input=self.state_input, action_input=self.actor.action_tensor) self._target_critic_with_target_actor_output = self.target_critic.make_copy(reuse=True, name='target_critic_with_target_actor_output_{}'.format( self.critic.name), action_input=self.target_actor.action_tensor) with tf.variable_scope(name): self.reward_input = tf.placeholder(shape=[None, 1], dtype=tf.float32) self.next_state_input = tf.placeholder(shape=[None, self.env_spec.flat_obs_dim], dtype=tf.float32) self.done_input = tf.placeholder(shape=[None, 1], dtype=tf.bool) self.target_q_input = tf.placeholder(shape=[None, 1], dtype=tf.float32) done = tf.cast(self.done_input, dtype=tf.float32) self.predict_q_value = (1. - done) * self.config('GAMMA') * self.target_q_input + self.reward_input with tf.variable_scope('train'): self.critic_loss, self.critic_update_op, self.target_critic_update_op, self.critic_optimizer, \ self.critic_grads = self._setup_critic_loss() self.actor_loss, self.actor_update_op, self.target_actor_update_op, self.action_optimizer, \ self.actor_grads = self._set_up_actor_loss() var_list = get_tf_collection_var_list( '{}/train'.format(name)) + self.critic_optimizer.variables() + self.action_optimizer.variables() self.parameters.set_tf_var_list(tf_var_list=sorted(list(set(var_list)), key=lambda x: x.name)) MultiPlaceholderInput.__init__(self, sub_placeholder_input_list=[dict(obj=self.target_actor, attr_name='target_actor', ), dict(obj=self.actor, attr_name='actor'), dict(obj=self.critic, attr_name='critic'), dict(obj=self.target_critic, attr_name='target_critic') ], parameters=self.parameters) @register_counter_info_to_status_decorator(increment=1, info_key='init', under_status='INITED') def init(self, sess=None, source_obj=None): self.actor.init() self.critic.init() self.target_actor.init() self.target_critic.init(source_obj=self.critic) self.parameters.init() if source_obj: self.copy_from(source_obj) super().init() @record_return_decorator(which_recorder='self') @register_counter_info_to_status_decorator(increment=1, info_key='train', under_status='TRAIN') def train(self, batch_data=None, train_iter=None, sess=None, update_target=True) -> dict: super(DDPG, self).train() if isinstance(batch_data, TrajectoryData): batch_data = batch_data.return_as_transition_data(shuffle_flag=True) tf_sess = sess if sess else tf.get_default_session() train_iter = self.parameters("TRAIN_ITERATION") if not train_iter else train_iter average_critic_loss = 0.0 average_actor_loss = 0.0 for i in range(train_iter): train_batch = self.replay_buffer.sample( batch_size=self.parameters('BATCH_SIZE')) if batch_data is None else batch_data assert isinstance(train_batch, TransitionData) critic_loss, _ = self._critic_train(train_batch, tf_sess) actor_loss, _ = self._actor_train(train_batch, tf_sess) average_actor_loss += actor_loss average_critic_loss += critic_loss if update_target: tf_sess.run([self.target_actor_update_op, self.target_critic_update_op]) return dict(average_actor_loss=average_actor_loss / train_iter, average_critic_loss=average_critic_loss / train_iter) def _critic_train(self, batch_data, sess) -> (): target_q = sess.run( self._target_critic_with_target_actor_output.q_tensor, feed_dict={ self._target_critic_with_target_actor_output.state_input: batch_data.new_state_set, self.target_actor.state_input: batch_data.new_state_set } ) loss, _, grads = sess.run( [self.critic_loss, self.critic_update_op, self.critic_grads ], feed_dict={ self.target_q_input: target_q, self.critic.state_input: batch_data.state_set, self.critic.action_input: batch_data.action_set, self.done_input: np.reshape(batch_data.done_set, [-1, 1]), self.reward_input: np.reshape(batch_data.reward_set, [-1, 1]), **self.parameters.return_tf_parameter_feed_dict() } ) return loss, grads def _actor_train(self, batch_data, sess) -> (): target_q, loss, _, grads = sess.run( [self._critic_with_actor_output.q_tensor, self.actor_loss, self.actor_update_op, self.actor_grads], feed_dict={ self.actor.state_input: batch_data.state_set, self._critic_with_actor_output.state_input: batch_data.state_set, **self.parameters.return_tf_parameter_feed_dict() } ) return loss, grads @register_counter_info_to_status_decorator(increment=1, info_key='test', under_status='TEST') def test(self, *arg, **kwargs) -> dict: return super().test(*arg, **kwargs) def predict(self, obs: np.ndarray, sess=None, batch_flag: bool = False): tf_sess = sess if sess else tf.get_default_session() feed_dict = { self.state_input: make_batch(obs, original_shape=self.env_spec.obs_shape), **self.parameters.return_tf_parameter_feed_dict() } return self.actor.forward(obs=obs, sess=tf_sess, feed_dict=feed_dict) def append_to_memory(self, samples: TransitionData): self.replay_buffer.append_batch(obs0=samples.state_set, obs1=samples.new_state_set, action=samples.action_set, reward=samples.reward_set, terminal1=samples.done_set) @record_return_decorator(which_recorder='self') def save(self, global_step, save_path=None, name=None, **kwargs): save_path = save_path if save_path else GlobalConfig().DEFAULT_MODEL_CHECKPOINT_PATH name = name if name else self.name MultiPlaceholderInput.save(self, save_path=save_path, global_step=global_step, name=name, **kwargs) return dict(check_point_save_path=save_path, check_point_save_global_step=global_step, check_point_save_name=name) @record_return_decorator(which_recorder='self') def load(self, path_to_model, model_name, global_step=None, **kwargs): MultiPlaceholderInput.load(self, path_to_model, model_name, global_step, **kwargs) return dict(check_point_load_path=path_to_model, check_point_load_global_step=global_step, check_point_load_name=model_name) def _setup_critic_loss(self): reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.critic.name_scope) loss = tf.reduce_sum((self.predict_q_value - self.critic.q_tensor) ** 2) if len(reg_loss) > 0: loss += tf.reduce_sum(reg_loss) optimizer = tf.train.AdamOptimizer(learning_rate=self.parameters('CRITIC_LEARNING_RATE')) grad_var_pair = optimizer.compute_gradients(loss=loss, var_list=self.critic.parameters('tf_var_list')) grads = [g[0] for g in grad_var_pair] if self.parameters('critic_clip_norm') is not None: grad_var_pair, grads = clip_grad(optimizer=optimizer, loss=loss, var_list=self.critic.parameters('tf_var_list'), clip_norm=self.parameters('critic_clip_norm')) optimize_op = optimizer.apply_gradients(grad_var_pair) op = [] for var, target_var in zip(self.critic.parameters('tf_var_list'), self.target_critic.parameters('tf_var_list')): ref_val = self.parameters('DECAY') * target_var + (1.0 - self.parameters('DECAY')) * var op.append(tf.assign(target_var, ref_val)) return loss, optimize_op, op, optimizer, grads def _set_up_actor_loss(self): reg_loss = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES, scope=self.actor.name_scope) loss = -tf.reduce_mean(self._critic_with_actor_output.q_tensor) if len(reg_loss) > 0: loss += tf.reduce_sum(reg_loss) optimizer = tf.train.AdamOptimizer(learning_rate=self.parameters('CRITIC_LEARNING_RATE')) grad_var_pair = optimizer.compute_gradients(loss=loss, var_list=self.actor.parameters('tf_var_list')) grads = [g[0] for g in grad_var_pair] if self.parameters('actor_clip_norm') is not None: grad_var_pair, grads = clip_grad(optimizer=optimizer, loss=loss, var_list=self.actor.parameters('tf_var_list'), clip_norm=self.parameters('critic_clip_norm')) optimize_op = optimizer.apply_gradients(grad_var_pair) op = [] for var, target_var in zip(self.actor.parameters('tf_var_list'), self.target_actor.parameters('tf_var_list')): ref_val = self.parameters('DECAY') * target_var + (1.0 - self.parameters('DECAY')) * var op.append(tf.assign(target_var, ref_val)) return loss, optimize_op, op, optimizer, grads