def build_env(args, _game_envs):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin':
        ncpu //= 2
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args, _game_envs)

    config = tf.ConfigProto(allow_soft_placement=True,
                            intra_op_parallelism_threads=1,
                            inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    reward_scale = args.reward_scale if hasattr(args, 'reward_scale') else 1
    flatten_dict_observations = alg not in {'her'}
    env = make_vec_env(env_id, env_type, args.num_env or 1, seed,
                       reward_scale=reward_scale,
                       flatten_dict_observations=flatten_dict_observations)

    if env_type == 'mujoco':
        env = VecNormalize(env, use_tf=True)

    # build one simple env without the vector wrapper
    tmp_env = make_env(env_id, env_type, seed=seed,
                       reward_scale=reward_scale,
                       flatten_dict_observations=flatten_dict_observations,
                       logger_dir=logger.get_dir())
    return env, tmp_env
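# Hypothetical usage sketch (not part of the repo): build_env only reads a
# handful of attributes from `args` -- alg, seed, num_env, reward_scale, plus
# whatever get_env_type consults (typically `env`) -- so an argparse-style
# namespace with those fields is enough to drive it. The env id, defaults and
# function name below are illustrative assumptions.
def _build_env_example(_game_envs):
    import argparse
    args = argparse.Namespace(alg='her', env='FetchReach-v1', seed=0,
                              num_env=1, reward_scale=1.0)
    env, tmp_env = build_env(args, _game_envs)
    return env, tmp_env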
def __init__(self, epsilon=1e-4, shape=(), scope=''):
    sess = get_session()

    self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64)
    self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
    self._new_count = tf.placeholder(shape=(), dtype=tf.float64)

    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        self._mean = tf.get_variable('mean', initializer=np.zeros(shape, 'float64'), dtype=tf.float64)
        self._var = tf.get_variable('std', initializer=np.ones(shape, 'float64'), dtype=tf.float64)
        self._count = tf.get_variable('count', initializer=np.full((), epsilon, 'float64'), dtype=tf.float64)

    self.update_ops = tf.group([
        self._var.assign(self._new_var),
        self._mean.assign(self._new_mean),
        self._count.assign(self._new_count)
    ])

    sess.run(tf.variables_initializer([self._mean, self._var, self._count]))
    self.sess = sess
    self._set_mean_var_count()
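# Sketch (an assumption, not the repo's code) of how the placeholders and
# update_ops above are typically driven: combine the stored moments with the
# batch moments using the parallel (Chan et al.) mean/variance formula in
# numpy, then assign the results back into the TF variables. The function name
# and the `.mean`, `.var`, `.count` attributes on `rms` are assumptions.
def tf_rms_update_sketch(rms, x):
    batch_mean = np.mean(x, axis=0)
    batch_var = np.var(x, axis=0)
    batch_count = x.shape[0]

    delta = batch_mean - rms.mean
    tot_count = rms.count + batch_count
    new_mean = rms.mean + delta * batch_count / tot_count
    m_a = rms.var * rms.count
    m_b = batch_var * batch_count
    m_2 = m_a + m_b + np.square(delta) * rms.count * batch_count / tot_count
    new_var = m_2 / tot_count

    # push the combined moments into the graph via the assign ops built above
    rms.sess.run(rms.update_ops, feed_dict={
        rms._new_mean: new_mean,
        rms._new_var: new_var,
        rms._new_count: tot_count,
    })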
def __init__(self, dimo, dimu, o_stats, u_stats, clip_norm=5, norm_eps=1e-4,
             hidden=400, layers=4, learning_rate=1e-3):
    # store the constructor arguments referenced below (the original class may
    # rely on a store_args-style decorator for this)
    self.dimo = dimo
    self.dimu = dimu
    self.o_stats = o_stats
    self.u_stats = u_stats
    self.learning_rate = learning_rate

    self.sess = U.get_session()
    with tf.variable_scope('forward_dynamics'):
        self.obs0 = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs0')
        self.obs1 = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs1')
        self.actions = tf.placeholder(tf.float32, shape=(None, self.dimu), name='actions')
        self.dynamics_scope = tf.get_variable_scope().name

        obs0_norm = self.o_stats.normalize(self.obs0)
        obs1_norm = self.o_stats.normalize(self.obs1)
        actions_norm = self.u_stats.normalize(self.actions)
        inputs = tf.concat(values=[obs0_norm, actions_norm], axis=-1)
        self.next_state_diff_tf = nn(inputs, [hidden] * layers + [self.dimo])
        self.next_state_denorm = self.o_stats.denormalize(self.next_state_diff_tf + obs0_norm)

        # without normalization:
        # inputs = tf.concat(values=[self.obs0, self.actions], axis=-1)
        # self.next_state_diff_tf = nn(inputs, [hidden] * layers + [self.dimo])
        # self.next_state_tf = self.next_state_diff_tf + self.obs0
        # self.next_state_denorm = self.next_state_tf

        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - obs1_norm + obs0_norm), axis=1)
        # self.per_sample_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1), axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.test_loss_tf = tf.reduce_mean(tf.abs(self.next_state_denorm - self.obs1))
        # self.test_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1))

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf, _vars(self.dynamics_scope), clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope), scale_grad_by_procs=False)

    # initialize
    tf.variables_initializer(_vars(self.dynamics_scope)).run()
    self.dynamics_adam.sync()
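# Hypothetical training-step sketch (not a method of the class above): feed a
# batch of raw transitions into the placeholders, fetch the loss and the
# flattened, clipped gradient, and apply it with the MpiAdam optimizer. The
# function name and the fixed stepsize are assumptions.
def forward_dynamics_update_sketch(dyn, obs0, actions, obs1, stepsize=1e-3):
    loss, grad = dyn.sess.run(
        [dyn.mean_loss_tf, dyn.dynamics_grads],
        feed_dict={dyn.obs0: obs0, dyn.actions: actions, dyn.obs1: obs1})
    dyn.dynamics_adam.update(grad, stepsize)
    return loss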
def profile_tf_runningmeanstd():
    import time
    from mher.common import tf_util

    tf_util.get_session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
                                              intra_op_parallelism_threads=1,
                                              allow_soft_placement=True))

    x = np.random.random((376,))
    n_trials = 10000
    rms = RunningMeanStd()
    tfrms = TfRunningMeanStd()

    tic1 = time.time()
    for _ in range(n_trials):
        rms.update(x)

    tic2 = time.time()
    for _ in range(n_trials):
        tfrms.update(x)

    tic3 = time.time()
    print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    tic1 = time.time()
    for _ in range(n_trials):
        z1 = rms.mean

    tic2 = time.time()
    for _ in range(n_trials):
        z2 = tfrms.mean

    assert z1 == z2
    tic3 = time.time()
    print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms get mean time ({} trials): {} s'.format(n_trials, tic3 - tic2))
def _create_network(self, reuse=False): logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() self._create_normalizer(reuse) batch_tf = self._get_batch_tf() # networks self._create_target_main(SAC_ActorCritic, reuse, batch_tf) # loss functions clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf) q_backup_tf = tf.stop_gradient(target_tf) v_backup_tf = tf.stop_gradient(self.main.min_q_pi_tf - self.sac_alpha * self.main.logp_pi_tf) q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2) q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2) v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2) self.abs_tf_error_tf = tf.reduce_mean(tf.abs(q_backup_tf - self.main.q1_tf) + tf.abs(q_backup_tf -self.main.q2_tf)) self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf self.pi_loss_tf = tf.reduce_mean(self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf) # virables value_params = get_var(self._name_variable('q')) + get_var(self._name_variable('v')) pi_params = get_var(self._name_variable('pi')) # gradients V_grads_tf = tf.gradients(self.value_loss_tf, value_params) pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params) self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params) # optimizers self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False) self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False) # polyak averaging self.main_vars = get_var(self._name_variable('pi')) + get_var(self._name_variable('q1')) + get_var(self._name_variable('q2')) + get_var(self._name_variable('v')) self.target_vars = get_var(self._name_variable('pi', main=False)) + get_var(self._name_variable('q1', main=False)) + get_var(self._name_variable('q2', main=False)) + get_var(self._name_variable('v', main=False)) self.init_target_net_op = list(map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list(map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), \ zip(self.target_vars, self.main_vars))) # initialize all variables self.global_vars = get_var(self.scope, key='global') tf.variables_initializer(self.global_vars).run() self._sync_optimizers() self._init_target_net()
def __init__(self, dimo, dimu, clip_norm=5, norm_eps=1e-4, hidden=256,
             layers=8, learning_rate=1e-3):
    # store the sizes referenced below (the original class may rely on a
    # store_args-style decorator for this)
    self.dimo = dimo
    self.dimu = dimu
    self.learning_rate = learning_rate

    # normalization is done outside the graph with numpy normalizers
    self.obs_normalizer = NormalizerNumpy(size=dimo, eps=norm_eps)
    self.action_normalizer = NormalizerNumpy(size=dimu, eps=norm_eps)

    self.sess = U.get_session()
    with tf.variable_scope('forward_dynamics_numpy'):
        self.obs0_norm = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs0')
        self.obs1_norm = tf.placeholder(tf.float32, shape=(None, self.dimo), name='obs1')
        self.actions_norm = tf.placeholder(tf.float32, shape=(None, self.dimu), name='actions')
        self.dynamics_scope = tf.get_variable_scope().name

        inputs = tf.concat(values=[self.obs0_norm, self.actions_norm], axis=-1)
        self.next_state_diff_tf = nn(inputs, [hidden] * layers + [self.dimo])
        self.next_state_norm_tf = self.next_state_diff_tf + self.obs0_norm

        # loss functions
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - self.obs1_norm + self.obs0_norm), axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf, _vars(self.dynamics_scope), clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope), scale_grad_by_procs=False)

    # initialize
    tf.variables_initializer(_vars(self.dynamics_scope)).run()
    self.dynamics_adam.sync()
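# Hypothetical usage sketch for the numpy-normalized variant above: statistics
# live in the NormalizerNumpy objects, so the batch is normalized outside the
# graph before being fed to the *_norm placeholders. The update()/normalize()
# interface on the normalizers is assumed from the repo's Normalizer classes;
# the function name and stepsize are assumptions.
def forward_dynamics_numpy_update_sketch(dyn, obs0, actions, obs1, stepsize=1e-3):
    dyn.obs_normalizer.update(obs0)
    dyn.action_normalizer.update(actions)
    feed = {
        dyn.obs0_norm: dyn.obs_normalizer.normalize(obs0),
        dyn.obs1_norm: dyn.obs_normalizer.normalize(obs1),
        dyn.actions_norm: dyn.action_normalizer.normalize(actions),
    }
    loss, grad = dyn.sess.run([dyn.mean_loss_tf, dyn.dynamics_grads], feed_dict=feed)
    dyn.dynamics_adam.update(grad, stepsize)
    return loss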
def test_nonfreeze():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    # for some reason the session config with inter_op_parallelism_threads was
    # causing nested sess.run calls to freeze
    config = tf.ConfigProto(inter_op_parallelism_threads=1)
    sess = U.get_session(config=config)
    update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD, learning_rate=stepsize).minimize(loss)
    sess.run(tf.global_variables_initializer())

    losslist_ref = []
    for i in range(100):
        l, _ = sess.run([loss, update_op])
        print(i, l)
        losslist_ref.append(l)
def _create_network(self, reuse=False): logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) self.sess = tf_util.get_session() # normalizer for input self._create_normalizer(reuse) batch_tf = self._get_batch_tf() # networks self._create_target_main(ActorCritic, reuse, batch_tf) # loss functions target_Q_pi_tf = self.target.Q_pi_tf clip_range = (-self.clip_return, 0. if self.clip_pos_returns else np.inf) target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf) self.abs_td_error_tf = tf.abs( tf.stop_gradient(target_tf) - self.main.Q_tf) self.Q_loss = tf.square(self.abs_td_error_tf) if self.priority: self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss) else: self.Q_loss_tf = tf.reduce_mean(self.Q_loss) self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) self.pi_loss_tf += self.action_l2 * tf.reduce_mean( tf.square(self.main.pi_tf / self.max_u)) # varibles self.main_Q_var = get_var(self.scope + '/main/Q') self.main_pi_var = get_var(self.scope + '/main/pi') self.target_Q_var = get_var(self.scope + '/target/Q') self.target_pi_var = get_var(self.scope + '/target/pi') Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var) pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var) assert len(self.main_Q_var) == len(Q_grads_tf) assert len(self.main_pi_var) == len(pi_grads_tf) self.Q_grads_vars_tf = zip(Q_grads_tf, self.main_Q_var) self.pi_grads_vars_tf = zip(pi_grads_tf, self.main_pi_var) self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self.main_Q_var) self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self.main_pi_var) # optimizers self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False) self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False) self.main_vars = self.main_Q_var + self.main_pi_var self.target_vars = self.target_Q_var + self.target_pi_var self.init_target_net_op = list( map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) self.update_target_net_op = list( map( lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), zip(self.target_vars, self.main_vars))) # initialize all variables self.global_vars = get_var(self.scope, key='global') tf.variables_initializer(self.global_vars).run() self._sync_optimizers() self._init_target_net()
def _serialize_variables():
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}
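# Hypothetical counterpart sketch to _serialize_variables (not shown in this
# excerpt): load a {variable_name: value} dict back into the trainable
# variables of the current graph, using the TF1 Variable.load API.
def _deserialize_variables_sketch(values):
    sess = get_session()
    for var in tf.trainable_variables():
        if var.name in values:
            var.load(values[var.name], session=sess)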