def load_low_level_policy(policy_path=None, name=None):
    with tf_utils.get_default_session().as_default():
        with tf.variable_scope(name, reuse=False):
            snapshot = joblib.load(policy_path)
            policy = snapshot["policy"]

    return policy
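# A minimal usage sketch for load_low_level_policy above. The snapshot path
# and scope name are hypothetical, and it assumes the surrounding module's
# imports (tf_utils, tf, joblib) plus an already-created default TF session.
def _example_load_pretrained_policy():
    policy = load_low_level_policy(
        policy_path="low_level_policy.pkl",  # hypothetical snapshot file
        name="low_level_policy")             # hypothetical variable scope
    return policy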
def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records the mean, min, max, and standard deviation of the Gaussian
    means, log standard deviations, and action log-probabilities, as well
    as the mu/sigma output regularization loss.
    """
    feeds = {self._observations_ph: batch['observations']}
    if self.todropoutpi:
        feeds[self.dropoutpi_placeholder] = 1.0
    if self.batchnormpi:
        feeds[self.isbnpitrainmode] = False
    sess = tf_utils.get_default_session()
    mu, log_sig, log_pi, reg_loss_t = sess.run((
        self.distribution.mu_t,
        self.distribution.log_sig_t,
        self.distribution.log_p_t,
        self.distribution._reg_loss_t,
    ), feeds)

    logger.record_tabular('policy-mus-mean', np.mean(mu))
    logger.record_tabular('policy-mus-min', np.min(mu))
    logger.record_tabular('policy-mus-max', np.max(mu))
    logger.record_tabular('policy-mus-std', np.std(mu))
    logger.record_tabular('log-sigs-mean', np.mean(log_sig))
    logger.record_tabular('log-sigs-min', np.min(log_sig))
    logger.record_tabular('log-sigs-max', np.max(log_sig))
    logger.record_tabular('log-sigs-std', np.std(log_sig))
    logger.record_tabular('log-pi-mean', np.mean(log_pi))
    logger.record_tabular('log-pi-max', np.max(log_pi))
    logger.record_tabular('log-pi-min', np.min(log_pi))
    logger.record_tabular('log-pi-std', np.std(log_pi))
    logger.record_tabular('mu-sig-output-reg', reg_loss_t)
def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records the mean, min, max, and standard deviation of the GMM means,
    component weights, and covariances.
    """
    feeds = {self._observations_ph: batch['observations']}
    sess = tf_utils.get_default_session()
    mus, log_sigs, log_ws = sess.run(
        (
            self.distribution.mus_t,
            self.distribution.log_sigs_t,
            self.distribution.log_ws_t,
        ),
        feeds
    )

    logger.record_tabular('gmm-mus-mean', np.mean(mus))
    logger.record_tabular('gmm-mus-min', np.min(mus))
    logger.record_tabular('gmm-mus-max', np.max(mus))
    logger.record_tabular('gmm-mus-std', np.std(mus))
    logger.record_tabular('gmm-log-w-mean', np.mean(log_ws))
    logger.record_tabular('gmm-log-w-min', np.min(log_ws))
    logger.record_tabular('gmm-log-w-max', np.max(log_ws))
    logger.record_tabular('gmm-log-w-std', np.std(log_ws))
    logger.record_tabular('gmm-log-sigs-mean', np.mean(log_sigs))
    logger.record_tabular('gmm-log-sigs-min', np.min(log_sigs))
    logger.record_tabular('gmm-log-sigs-max', np.max(log_sigs))
    logger.record_tabular('gmm-log-sigs-std', np.std(log_sigs))
def pis_for(self, obs):
    feeds = {self._observations_ph: obs}
    sess = tf_utils.get_default_session()
    x_t = sess.run(
        (
            self.distribution.x_t,
        ),
        feeds
    )
    return x_t
def log_pis_for(self, obs):
    feeds = {self._observations_ph: obs}
    sess = tf_utils.get_default_session()
    log_pi = sess.run(
        (
            self.distribution.log_p_t,
        ),
        feeds
    )
    return log_pi
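# A minimal usage sketch for pis_for / log_pis_for above. Because sess.run is
# given a 1-tuple of fetches, each call returns a single-element list, which
# the sketch unpacks; `policy` and `observations` are hypothetical stand-ins
# for an instance of this class and a batch of observations.
def _example_query_policy(policy, observations):
    actions = policy.pis_for(observations)[0]        # sampled actions x_t
    log_probs = policy.log_pis_for(observations)[0]  # action log-probabilities
    return actions, log_probs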
def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records the batch-mean of the summed action probabilities, which
    should be close to 1 for a properly normalized distribution.
    """
    feeds = {self._observations_ph: batch['observations']}
    sess = tf_utils.get_default_session()
    probs = sess.run(self.distribution.p_all, feeds)

    logger.record_tabular('policy-prob-sum', np.mean(np.sum(probs, 1)))
def __init__(
        self,
        sampler,
        n_epochs=1000,
        n_train_repeat=1,
        n_initial_exploration_steps=10000,
        epoch_length=1000,
        eval_n_episodes=10,
        eval_deterministic=True,
        eval_render=False,
        control_interval=1,
        expert_path="dataset/hopper.npz",
        max_bc_iter=int(1e5),
):
    """
    Args:
        n_epochs (`int`): Number of epochs to run the training for.
        n_train_repeat (`int`): Number of times to repeat the training
            for single time step.
        n_initial_exploration_steps: Number of steps in the beginning to
            take using actions drawn from a separate exploration policy.
        epoch_length (`int`): Epoch length.
        eval_n_episodes (`int`): Number of rollouts to evaluate.
        eval_deterministic (`bool`): Whether or not to run the policy in
            deterministic mode when evaluating policy.
        eval_render (`bool`): Whether or not to render the evaluation
            environment.
    """
    self.sampler = sampler

    self._n_epochs = int(n_epochs)
    self._n_train_repeat = n_train_repeat
    self._epoch_length = epoch_length
    self._n_initial_exploration_steps = n_initial_exploration_steps
    self._control_interval = control_interval

    self._eval_n_episodes = eval_n_episodes
    self._eval_deterministic = eval_deterministic
    self._eval_render = eval_render

    self._expert_path = expert_path
    self._max_bc_iter = max_bc_iter

    self._sess = tf_utils.get_default_session()

    self._env = None
    self._policy = None
    self._pool = None
def __init__(
        self,
        batch_size=64,
        n_epochs=1000,
        n_train_repeat=1,
        epoch_length=1000,
        min_pool_size=10000,
        max_path_length=1000,
        eval_n_episodes=10,
        eval_deterministic=True,
        eval_render=False,
):
    """
    Args:
        batch_size (`int`): Size of the sample batch to be used for
            training.
        n_epochs (`int`): Number of epochs to run the training for.
        n_train_repeat (`int`): Number of times to repeat the training
            for single time step.
        epoch_length (`int`): Epoch length.
        min_pool_size (`int`): Minimum size of the sample pool before
            running training.
        max_path_length (`int`): Number of timesteps before resetting
            environment and policy, and the number of paths used for
            evaluation rollout.
        eval_n_episodes (`int`): Number of rollouts to evaluate.
        eval_deterministic (`bool`): Whether or not to run the policy in
            deterministic mode when evaluating policy.
        eval_render (`bool`): Whether or not to render the evaluation
            environment.
    """
    self._batch_size = batch_size
    self._n_epochs = n_epochs
    self._n_train_repeat = n_train_repeat
    self._epoch_length = epoch_length
    self._min_pool_size = min_pool_size
    self._max_path_length = max_path_length

    self._eval_n_episodes = eval_n_episodes
    self._eval_deterministic = eval_deterministic
    self._eval_render = eval_render

    self._sess = tf_utils.get_default_session()

    self._env = None
    self._policy = None
    self._pool = None
def __init__(self,
             sampler,
             n_epochs=1000,
             n_train_repeat=1,
             n_initial_exploration_steps=10000,
             epoch_length=1000,
             eval_n_episodes=10,
             eval_deterministic=True,
             eval_render=False,
             control_interval=1,
             gpu_fraction=1.0):
    """
    Args:
        n_epochs (`int`): Number of epochs to run the training for.
        n_train_repeat (`int`): Number of times to repeat the training
            for single time step.
        n_initial_exploration_steps: Number of steps in the beginning to
            take using actions drawn from a separate exploration policy.
        epoch_length (`int`): Epoch length.
        eval_n_episodes (`int`): Number of rollouts to evaluate.
        eval_deterministic (`bool`): Whether or not to run the policy in
            deterministic mode when evaluating policy.
        eval_render (`bool`): Whether or not to render the evaluation
            environment.
    """
    self.sampler = sampler

    self._n_epochs = int(n_epochs)
    self._n_train_repeat = n_train_repeat
    self._epoch_length = epoch_length
    self._n_initial_exploration_steps = n_initial_exploration_steps
    self._control_interval = control_interval

    self._eval_n_episodes = eval_n_episodes
    self._eval_deterministic = eval_deterministic
    self._eval_render = eval_render

    # Hack to get GPU fraction for parallelization.
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=gpu_fraction)
    config = tf.ConfigProto(gpu_options=gpu_options)
    self._sess = tf_utils.get_default_session(config=config)

    self._env = None
    self._policy = None
    self._pool = None
def __init__(
        self,
        sampler,
        n_epochs=1000,
        n_train_repeat=1,
        epoch_length=2000,
        eval_n_episodes=10,
        eval_n_frequency=1,
        eval_deterministic=True,
        eval_render=False,
        control_interval=1,
):
    """
    Args:
        n_epochs (`int`): Number of epochs to run the training for.
        n_train_repeat (`int`): Number of times to repeat the training
            for single time step.
        epoch_length (`int`): Epoch length.
        eval_n_episodes (`int`): Number of rollouts to evaluate.
        eval_deterministic (`bool`): Whether or not to run the policy in
            deterministic mode when evaluating policy.
        eval_render (`bool`): Whether or not to render the evaluation
            environment.
    """
    self.sampler = sampler

    self._n_epochs = n_epochs
    self._n_train_repeat = n_train_repeat
    self._epoch_length = epoch_length
    self._control_interval = control_interval

    self._eval_n_episodes = eval_n_episodes
    self._eval_n_frequency = eval_n_frequency
    self._eval_deterministic = eval_deterministic
    self._eval_render = eval_render

    self._sess = tf_utils.get_default_session()

    self._env = None
    self._policy = None
    self._pool = None

    self.log_writer = None
def log_diagnostics(self, iteration, batch):
    """Record diagnostic information to the logger.

    Records the mean, min, max, and standard deviation of the policy's
    action log-probabilities over the batch.
    """
    sess = tf_utils.get_default_session()
    feed = {
        self._observations_ph: batch["observations"],
        self.sub_level_actions: batch["sub_level_actions"],
        self.sub_level_entropies: batch["sub_level_probs"]
    }
    log_pi = sess.run(self.log_pi, feed)

    logger.record_tabular('log-pi-mean', np.mean(log_pi))
    logger.record_tabular('log-pi-max', np.max(log_pi))
    logger.record_tabular('log-pi-min', np.min(log_pi))
    logger.record_tabular('log-pi-std', np.std(log_pi))
def eval(self, *inputs):
    feeds = {pl: val for pl, val in zip(self._input_pls, inputs)}
    return tf_utils.get_default_session().run(self._output_t, feeds)
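# A minimal usage sketch for eval above: positional arguments are zipped with
# self._input_pls in order, so this call assumes a function whose only input
# placeholder is the observation. `value_fn` and `observations` are
# hypothetical stand-ins.
def _example_eval_value(value_fn, observations):
    # Equivalent to running value_fn._output_t with the observation feed.
    return value_fn.eval(observations)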
def _train(self, env, policy, pool):
    """When training our policy expects an augmented observation."""
    self._init_training(env, policy, pool)

    with self._sess.as_default():
        env._wrapped_env.env.initialize(seed_task=SEED_TASK)
        observation = env.reset()
        policy.reset()
        log_p_z_episode = []  # Store log_p_z for this episode
        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0
        self.prev_n_episodes = 0

        if self._learn_p_z:
            log_p_z_list = [
                deque(maxlen=self._max_path_length)
                for _ in range(self._num_skills)
            ]

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            path_length_list = []
            z = self._sample_z()
            aug_obs = utils.concat_obs_z(observation, z, self._num_skills)

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                action, _ = policy.get_action(aug_obs)

                if self._learn_p_z:
                    (obs, _) = utils.split_aug_obs(aug_obs,
                                                   self._num_skills)
                    feed_dict = {
                        self._discriminator._obs_pl: obs[None],
                        self._discriminator._action_pl: action[None]
                    }
                    logits = tf_utils.get_default_session().run(
                        self._discriminator._output_t, feed_dict)[0]
                    log_p_z = np.log(utils._softmax(logits)[z])
                    if self._learn_p_z:
                        log_p_z_list[z].append(log_p_z)

                next_ob, reward, terminal, info = env.step(action)
                aug_next_ob = utils.concat_obs_z(next_ob, z,
                                                 self._num_skills)
                path_length += 1
                path_return += reward

                self._pool.add_sample(
                    aug_obs,
                    action,
                    reward,
                    terminal,
                    aug_next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    path_length_list.append(path_length)

                    # print("\n===RESET", epoch, n_episodes, "===",
                    #       self._epoch_length, path_length, "===",
                    #       # env._wrapped_env.env.nstep_internal,
                    #       datetime.datetime.now())
                    env._wrapped_env.env.initialize(seed_task=SEED_TASK)
                    observation = env.reset()
                    policy.reset()
                    log_p_z_episode = []
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1

                    # EPOCH IS DONE
                    if not epoch % 10:
                        logger.log("Epoch: {:4} | Episodes: {}".format(
                            epoch, n_episodes), with_prefix=False)

                    if not n_episodes % self.eval_freq or \
                            n_episodes >= EPISODE_LIMIT or \
                            epoch >= self._n_epochs:
                        # is_final = epoch >= self._n_epochs \
                        #     or n_episodes >= EPISODE_LIMIT
                        self.sample_skills_to_bd(n_epoch=epoch,
                                                 n_episodes=n_episodes)

                        # Make snapshot
                        params = self.get_snapshot(epoch)
                        logger.save_itr_params(epoch, params)
                        gt.stamp('behaviours')
                else:
                    aug_obs = aug_next_ob

                gt.stamp('sample')

                if self._pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self._pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)
                gt.stamp('train')

                # Terminate after 1000000 episodes
                if n_episodes >= EPISODE_LIMIT:
                    break
            else:
                continue
            break

        if self._learn_p_z:
            print('learning p(z)')
            for z in range(self._num_skills):
                if log_p_z_list[z]:
                    print('\t skill = %d, min=%.2f, max=%.2f, mean=%.2f, '
                          'len=%d' % (z, np.min(log_p_z_list[z]),
                                      np.max(log_p_z_list[z]),
                                      np.mean(log_p_z_list[z]),
                                      len(log_p_z_list[z])))
            log_p_z = [
                np.mean(log_p_z)
                if log_p_z else np.log(1.0 / self._num_skills)
                for log_p_z in log_p_z_list
            ]
            print('log_p_z: %s' % log_p_z)
            self._p_z = utils._softmax(log_p_z)

        logger.push_prefix('Epoch #%d | ' % epoch)
        self._evaluate(epoch)

        params = self.get_snapshot(epoch)
        logger.save_itr_params(epoch, params)
        times_itrs = gt.get_times().stamps.itrs

        eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
        total_time = gt.get_times().total
        logger.record_tabular('time-train', times_itrs['train'][-1])
        logger.record_tabular('time-eval', eval_time)
        logger.record_tabular('time-sample', times_itrs['sample'][-1])
        logger.record_tabular('time-total', total_time)
        logger.record_tabular('epoch', epoch)
        logger.record_tabular('episodes', n_episodes)
        logger.record_tabular('max-path-return', max_path_return)
        logger.record_tabular('last-path-return', last_path_return)
        logger.record_tabular('pool-size', self._pool.size)
        logger.record_tabular('path-length', np.mean(path_length_list))

        logger.dump_tabular(with_prefix=False)
        logger.pop_prefix()

        gt.stamp('eval')

        env.terminate()
def eval(self, *inputs):
    feeds = {pl: val for pl, val in zip(self._input_pls, inputs)}
    if self.todropoutvf:
        feeds[self.dropoutvf_placeholder] = self.dropoutvf_keep_prob
    return tf_utils.get_default_session().run(self._output_t, feeds)
def _train(self, env, policy, pool):
    """When training our policy expects an augmented observation."""
    self._init_training(env, policy, pool)

    with self._sess.as_default():
        observation = env.reset()
        policy.reset()
        log_p_z_episode = []  # Store log_p_z for this episode
        path_length = 0
        path_return = 0
        last_path_return = 0
        max_path_return = -np.inf
        n_episodes = 0

        if self._learn_p_z:
            log_p_z_list = [
                deque(maxlen=self._max_path_length)
                for _ in range(self._num_skills)
            ]

        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            path_length_list = []
            z = self._sample_z()
            aug_obs = utils.concat_obs_z(observation, z, self._num_skills,
                                         concat_type=self.concat_type)

            for t in range(self._epoch_length):
                iteration = t + epoch * self._epoch_length

                action, _ = policy.get_action(aug_obs)

                if self._learn_p_z:
                    (obs, _) = utils.split_aug_obs(aug_obs,
                                                   self._num_skills)
                    feed_dict = {
                        self._discriminator._obs_pl: obs[None],
                        self._discriminator._action_pl: action[None]
                    }
                    logits = tf_utils.get_default_session().run(
                        self._discriminator._output_t, feed_dict)[0]
                    log_p_z = np.log(utils._softmax(logits)[z])
                    if self._learn_p_z:
                        log_p_z_list[z].append(log_p_z)

                next_ob, reward, terminal, info = env.step(action)
                aug_next_ob = utils.concat_obs_z(
                    next_ob, z, self._num_skills,
                    concat_type=self.concat_type)
                path_length += 1
                path_return += reward

                self._pool.add_sample(
                    aug_obs,
                    action,
                    reward,
                    terminal,
                    aug_next_ob,
                )

                if terminal or path_length >= self._max_path_length:
                    path_length_list.append(path_length)
                    observation = env.reset()
                    policy.reset()
                    log_p_z_episode = []
                    path_length = 0
                    max_path_return = max(max_path_return, path_return)
                    last_path_return = path_return
                    path_return = 0
                    n_episodes += 1
                else:
                    aug_obs = aug_next_ob

                gt.stamp('sample')

                if self._pool.size >= self._min_pool_size:
                    for i in range(self._n_train_repeat):
                        batch = self._pool.random_batch(self._batch_size)
                        self._do_training(iteration, batch)
                gt.stamp('train')

            if self._learn_p_z:
                print('learning p(z)')
                for z in range(self._num_skills):
                    if log_p_z_list[z]:
                        print('\t skill = %d, min=%.2f, max=%.2f, '
                              'mean=%.2f, len=%d' %
                              (z, np.min(log_p_z_list[z]),
                               np.max(log_p_z_list[z]),
                               np.mean(log_p_z_list[z]),
                               len(log_p_z_list[z])))
                log_p_z = [
                    np.mean(log_p_z)
                    if log_p_z else np.log(1.0 / self._num_skills)
                    for log_p_z in log_p_z_list
                ]
                print('log_p_z: %s' % log_p_z)
                self._p_z = utils._softmax(log_p_z)

            self._evaluate(epoch)

            params = self.get_snapshot(epoch)
            logger.save_itr_params(epoch, params)
            times_itrs = gt.get_times().stamps.itrs

            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)
            logger.record_tabular('episodes', n_episodes)
            logger.record_tabular('max-path-return', max_path_return)
            logger.record_tabular('last-path-return', last_path_return)
            logger.record_tabular('pool-size', self._pool.size)
            logger.record_tabular('path-length', np.mean(path_length_list))

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        env.terminate()
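# For reference, a hedged sketch of the observation augmentation both _train
# loops above rely on: the skill index z is appended to the raw observation
# as a one-hot vector (the concat_type variant used in the second loop is not
# reproduced). This mirrors the DIAYN-style utils.concat_obs_z helper but is
# an illustrative stand-in, not the project's implementation.
def _example_concat_obs_z(obs, z, num_skills):
    one_hot_z = np.zeros(num_skills)
    one_hot_z[z] = 1.0
    return np.concatenate([obs, one_hot_z])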
def train(self):
    """
    CG: the function that conducts ensemble training.
    :return:
    """
    # Set up parameters for the training process.
    self._n_epochs = self._base_ac_params['n_epochs']
    self._epoch_length = self._base_ac_params['epoch_length']
    self._n_train_repeat = self._base_ac_params['n_train_repeat']
    self._n_initial_exploration_steps = self._base_ac_params[
        'n_initial_exploration_steps']
    self._eval_render = self._base_ac_params['eval_render']
    self._eval_n_episodes = self._base_ac_params['eval_n_episodes']
    self._eval_deterministic = self._base_ac_params['eval_deterministic']

    # Set up the evaluation environment.
    if self._eval_n_episodes > 0:
        with tf.variable_scope("low_level_policy", reuse=True):
            self._eval_env = deep_clone(self._env)

    # Set up the TensorFlow session.
    self._sess = tf_utils.get_default_session()

    # Import required libraries for training.
    import random
    import math
    import operator
    import numpy as np

    # Initialize the sampler.
    alg_ins = random.choice(self._alg_instances)
    self._sampler.initialize(self._env, alg_ins[0].policy, self._pool)

    # Perform the training/evaluation process.
    num_episode = 0.
    with self._sess.as_default():
        gt.rename_root('RLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)

        for epoch in gt.timed_for(range(self._n_epochs + 1),
                                  save_itrs=True):
            logger.push_prefix('Epoch #%d | ' % epoch)

            for t in range(self._epoch_length):
                isEpisodeEnd = self._sampler.sample()

                # If an episode is ended, we need to update performance
                # statistics for each AC instance and pick randomly another
                # AC instance for next episode of exploration.
                if isEpisodeEnd:
                    num_episode = num_episode + 1.
                    alg_ins[1] = 0.9 * alg_ins[
                        1] + 0.1 * self._sampler._last_path_return
                    alg_ins[2] = alg_ins[2] + 1.

                    if self._use_ucb:
                        # Select an algorithm instance based on UCB.
                        selected = False
                        for ains in self._alg_instances:
                            if ains[2] < 1.:
                                alg_ins = ains
                                selected = True
                                break
                            else:
                                ains[3] = ains[1] + math.sqrt(
                                    2.0 * math.log(num_episode) / ains[2])

                        if not selected:
                            alg_ins = max(self._alg_instances,
                                          key=operator.itemgetter(3))
                    else:
                        # Select an algorithm instance uniformly at random.
                        alg_ins = random.choice(self._alg_instances)

                    self._sampler.set_policy(alg_ins[0].policy)

                if not self._sampler.batch_ready():
                    continue
                gt.stamp('sample')

                # Perform training over all AC instances.
                for i in range(self._n_train_repeat):
                    batch = self._sampler.random_batch()
                    for ains in self._alg_instances:
                        ains[0]._do_training(
                            iteration=t + epoch * self._epoch_length,
                            batch=batch)
                gt.stamp('train')

            # Perform evaluation after one full epoch of training is
            # completed.
            if self._eval_n_episodes < 1:
                continue

            if self._evaluation_strategy == 'ensemble':
                # Use a whole ensemble of AC instances for evaluation.
                paths = rollouts(self._eval_env, self,
                                 self._sampler._max_path_length,
                                 self._eval_n_episodes)
            elif self._evaluation_strategy == 'best-policy':
                # Choose the AC instance with the highest observed
                # performance so far for evaluation.
                eval_alg_ins = max(self._alg_instances,
                                   key=operator.itemgetter(1))
                with eval_alg_ins[0].policy.deterministic(
                        self._eval_deterministic):
                    paths = rollouts(self._eval_env,
                                     eval_alg_ins[0].policy,
                                     self._sampler._max_path_length,
                                     self._eval_n_episodes)
            else:
                paths = None

            if paths is not None:
                total_returns = [path['rewards'].sum() for path in paths]
                episode_lengths = [len(p['rewards']) for p in paths]
                logger.record_tabular('return-average',
                                      np.mean(total_returns))
                logger.record_tabular('return-min', np.min(total_returns))
                logger.record_tabular('return-max', np.max(total_returns))
                logger.record_tabular('return-std', np.std(total_returns))
                logger.record_tabular('episode-length-avg',
                                      np.mean(episode_lengths))
                logger.record_tabular('episode-length-min',
                                      np.min(episode_lengths))
                logger.record_tabular('episode-length-max',
                                      np.max(episode_lengths))
                logger.record_tabular('episode-length-std',
                                      np.std(episode_lengths))

                self._eval_env.log_diagnostics(paths)
                if self._eval_render:
                    self._eval_env.render(paths)

            # Produce log info after each episode of training and
            # evaluation.
            times_itrs = gt.get_times().stamps.itrs
            eval_time = times_itrs['eval'][-1] if epoch > 1 else 0
            total_time = gt.get_times().total
            logger.record_tabular('time-train', times_itrs['train'][-1])
            logger.record_tabular('time-eval', eval_time)
            logger.record_tabular('time-sample', times_itrs['sample'][-1])
            logger.record_tabular('time-total', total_time)
            logger.record_tabular('epoch', epoch)

            self._sampler.log_diagnostics()

            logger.dump_tabular(with_prefix=False)
            logger.pop_prefix()

            gt.stamp('eval')

        # Terminate the sampler after the training process is completed.
        self._sampler.terminate()
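# A standalone sketch of the UCB1 rule used in train() above for picking the
# next AC instance. Each entry of alg_instances mirrors self._alg_instances:
# [algorithm, moving-average return, times used, UCB score]. Instances that
# have never been tried are explored first; afterwards the score
# return + sqrt(2 * ln(num_episode) / times_used) is maximised.
import math

def _example_ucb_select(alg_instances, num_episode):
    for ains in alg_instances:
        if ains[2] < 1.:
            return ains  # try every instance at least once
        ains[3] = ains[1] + math.sqrt(2.0 * math.log(num_episode) / ains[2])
    return max(alg_instances, key=lambda ains: ains[3])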
def __init__(
        self,
        environment_name,
        algorithm_name,
        lr,
        scale_reward,
        scale_entropy,
        discount,
        tau,
        max_replay_buffer_size,
        sampler_params,
        value_func_layers_number,
        value_func_layer_size,
        policy_func_layers_number,
        policy_func_layer_size,
        base_ac_alg_params,
        q_param_list,
        use_ucb=False,
        evaluation_strategy='ensemble',
):
    """
    CG: the constructor.
    :param environment_name: the name of the environment in string.
    :param algorithm_name: the name of the AC algorithm to be used in the ensemble.
    :param lr: the learning rate to be used in the ensemble.
    :param scale_reward: the reward scaling factor.
    :param scale_entropy: the entropy scaling factor.
    :param discount: the reward discount factor.
    :param tau: the target value function updating factor.
    :param max_replay_buffer_size: the maximum size of the replay buffer.
    :param sampler_params: extra parameter settings for the random sampler.
    :param value_func_layers_number: the number of hidden layers for the value network, i.e. V function and Q function.
    :param value_func_layer_size: the number of neurons of each hidden layer of the value network.
    :param policy_func_layers_number: the number of hidden layers for the policy network.
    :param policy_func_layer_size: the number of neurons of each hidden layer of the policy network.
    :param base_ac_alg_params: base parameters for the AC algorithm.
    :param q_param_list: the list of q values for the ensemble. Each q value in the list represents one AC instance in the ensemble.
    :param use_ucb: an indicator regarding the use of UCB for selecting AC instances in the ensemble for exploration.
    :param evaluation_strategy: the strategy used for evaluation. We have two strategies available, 'ensemble' and 'best-policy'.
    """
    # Set up the environment.
    self._environment_name = environment_name
    self._env = GymEnv(self._environment_name)

    # Set up the algorithm parameters.
    self._algorithm_name = algorithm_name
    self._lr = lr
    self._scale_reward = scale_reward
    self._scale_entropy = scale_entropy
    self._discount = discount
    self._tau = tau
    self._use_ucb = use_ucb
    self._evaluation_strategy = evaluation_strategy

    # Set up the replay buffer.
    self._max_replay_buffer_size = max_replay_buffer_size
    self._pool = SimpleReplayBuffer(
        env_spec=self._env.spec,
        max_replay_buffer_size=self._max_replay_buffer_size)

    # Set up the environment sampler.
    self._sampler_params = sampler_params
    self._sampler = SimpleSampler(**self._sampler_params)

    # Set up the required number of AC instances in the ensemble. Each AC
    # instance has its own value network and policy network.
    self._alg_instances = []
    self._base_ac_params = base_ac_alg_params
    self._base_alg_params = dict(self._base_ac_params,
                                 sampler=self._sampler)
    for id, q_val in enumerate(q_param_list):
        # Set up the value function networks for an AC instance.
        qf1 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=tuple([
                              value_func_layer_size
                              for _ in range(value_func_layers_number)
                          ]),
                          name=str(id) + 'qf1')
        qf2 = NNQFunction(env_spec=self._env.spec,
                          hidden_layer_sizes=tuple([
                              value_func_layer_size
                              for _ in range(value_func_layers_number)
                          ]),
                          name=str(id) + 'qf2')
        vf = NNVFunction(env_spec=self._env.spec,
                         hidden_layer_sizes=tuple([
                             value_func_layer_size
                             for _ in range(value_func_layers_number)
                         ]),
                         name=str(id) + 'vf')

        # Set up the policy network for an AC instance.
        policy = GaussianPolicy(
            env_spec=self._env.spec,
            hidden_layer_sizes=tuple([
                policy_func_layer_size
                for _ in range(policy_func_layers_number)
            ]),
            squash=True,
            reparameterize=False,
            reg=1.e-3,
            name=str(id) + 'gaussian_policy')
        initial_exploration_policy = policy

        # Set up an AC instance.
        if self._algorithm_name == 'sac':
            algorithm = SACV1(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
            )
        elif self._algorithm_name == 'tac':
            algorithm = TAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                tsallisQ=q_val,
            )
        elif self._algorithm_name == 'rac':
            algorithm = RAC(
                base_kwargs=self._base_alg_params,
                env=self._env,
                policy=policy,
                initial_exploration_policy=initial_exploration_policy,
                pool=self._pool,
                qf1=qf1,
                qf2=qf2,
                vf=vf,
                lr=self._lr,
                scale_reward=self._scale_reward,
                scale_entropy=self._scale_entropy,
                discount=self._discount,
                tau=self._tau,
                reparameterize=False,
                target_update_interval=1,
                action_prior='uniform',
                save_full_state=False,
                renyiQ=q_val,
            )
        else:
            raise NotImplementedError

        # Initialize the AC instance.
        # algorithm._sess.run(tf.global_variables_initializer())

        # Put the initialized AC instance into the algorithm instance list.
        # Each element of the algorithm instance list is made up of
        #     the algorithm instance,
        #     the moving average performance of the instance,
        #     the number of times the instance has been used for
        #     exploration previously, and
        #     the UCB bound.
        self._alg_instances.append([algorithm, 0.0, 0.0, 0.0])

    # Set up the ensemble Q-function for action selection.
    self._Q_ensemble = NNQFunction(
        env_spec=self._env.spec,
        hidden_layer_sizes=tuple([
            value_func_layer_size
            for _ in range(value_func_layers_number)
        ]),
        name='ensqf')

    # ========================================================================
    # Set up the training target for the ensemble Q-function for action
    # selection.
    # ========================================================================
    # Create the observation placeholder.
    self._observations_ens_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.observation_space.flat_dim),
        name='obv_ens',
    )

    # Create the next observation placeholder.
    self._observations_ens_next_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.observation_space.flat_dim),
        name='next_obv_ens',
    )

    # Create a list of next action placeholders.
    self._acts_next_phs = []
    for i in range(len(q_param_list)):
        act_ens_ph = tf.placeholder(
            tf.float32,
            shape=(None, self._env.spec.action_space.flat_dim),
            name=str(i) + '_next_act_ens',
        )
        self._acts_next_phs.append(act_ens_ph)

    # Create the observed action placeholder.
    self._obv_act_ph = tf.placeholder(
        tf.float32,
        shape=(None, self._env.spec.action_space.flat_dim),
        name='act_obv_ens',
    )

    # Create the reward placeholder.
    self._rewards_ph = tf.placeholder(
        tf.float32,
        shape=(None, ),
        name='rew_ens',
    )

    # Create the terminal placeholder.
    self._terminals_ph = tf.placeholder(
        tf.float32,
        shape=(None, ),
        name='ter_ens',
    )

    # Determine the target Q-value for the next step.
    self._q_ens_targets = []
    for act_next_ph in self._acts_next_phs:
        qt = self._Q_ensemble.get_output_for(
            self._observations_ens_next_ph, act_next_ph, reuse=True)
        self._q_ens_targets.append(qt)

    for i, q_t in enumerate(self._q_ens_targets):
        if i == 0:
            self._q_ens_next = q_t
        else:
            self._q_ens_next = tf.maximum(self._q_ens_next, q_t)
            # self._q_ens_next = self._q_ens_next + q_t
    # self._q_ens_next = self._q_ens_next / len(self._q_ens_targets)

    # Determine the Q-loss.
    self._q_train = self._Q_ensemble.get_output_for(
        self._observations_ens_ph, self._obv_act_ph, reuse=True)
    self._q_ens_loss = 0.5 * tf.reduce_mean(
        (self._q_train -
         tf.stop_gradient(self._scale_reward * self._rewards_ph +
                          (1 - self._terminals_ph) * self._discount *
                          self._q_ens_next))**2)

    # Determine the Q-training operator.
    self._q_ens_train_operator = tf.train.AdamOptimizer(self._lr).minimize(
        loss=self._q_ens_loss,
        var_list=self._Q_ensemble.get_params_internal())

    # Set up the TensorFlow session.
    self._sess = tf_utils.get_default_session()
    self._sess.run(tf.global_variables_initializer())
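# A hedged sketch of one training step for the ensemble Q-function defined in
# __init__ above. The batch keys and the use of each instance's policy to
# produce next actions are assumptions; the placeholders, loss, and train op
# are the ones created above.
def _example_q_ensemble_step(ensemble, batch):
    feed = {
        ensemble._observations_ens_ph: batch['observations'],
        ensemble._observations_ens_next_ph: batch['next_observations'],
        ensemble._obv_act_ph: batch['actions'],
        ensemble._rewards_ph: batch['rewards'],
        ensemble._terminals_ph: batch['terminals'],
    }
    # One next-action placeholder per ensemble member, filled with that
    # member's policy evaluated at the next observations.
    for ph, ains in zip(ensemble._acts_next_phs, ensemble._alg_instances):
        feed[ph] = ains[0].policy.get_actions(batch['next_observations'])
    _, loss = ensemble._sess.run(
        [ensemble._q_ens_train_operator, ensemble._q_ens_loss], feed)
    return loss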