def build_env(args, _game_envs):
    ncpu = multiprocessing.cpu_count()
    if sys.platform == 'darwin': ncpu //= 2
    alg = args.alg
    seed = args.seed

    env_type, env_id = get_env_type(args, _game_envs)
    config = tf.ConfigProto(allow_soft_placement=True,
                        intra_op_parallelism_threads=1,
                        inter_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    get_session(config=config)

    reward_scale = args.reward_scale if hasattr(args, 'reward_scale') else 1
    flatten_dict_observations = alg not in {'her'}
    # default to one env per (logical) CPU when --num_env is not given;
    # this is what the ncpu computed above is for
    env = make_vec_env(env_id, env_type, args.num_env or ncpu, seed,
                       reward_scale=reward_scale,
                       flatten_dict_observations=flatten_dict_observations)

    if env_type == 'mujoco':
        env = VecNormalize(env, use_tf=True)
    # build one simple env without vector wrapper
    tmp_env = make_env(env_id, env_type, seed=seed,
                        reward_scale=reward_scale,
                        flatten_dict_observations=flatten_dict_observations,
                        logger_dir=logger.get_dir())

    return env, tmp_env
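A minimal usage sketch for build_env (the attribute names below are hypothetical; get_env_type is assumed to read args.env, and _game_envs is whatever env-type registry the caller maintains):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--env', default='HalfCheetah-v2')
parser.add_argument('--alg', default='her')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--num_env', type=int, default=None)
parser.add_argument('--reward_scale', type=float, default=1.0)
args = parser.parse_args()

# _game_envs: {env_type: set(env_ids)}, built by the caller
env, tmp_env = build_env(args, _game_envs)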
Example #2
    def __init__(self, epsilon=1e-4, shape=(), scope=''):
        sess = get_session()

        self._new_mean = tf.placeholder(shape=shape, dtype=tf.float64)
        self._new_var = tf.placeholder(shape=shape, dtype=tf.float64)
        self._new_count = tf.placeholder(shape=(), dtype=tf.float64)

        with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
            self._mean = tf.get_variable('mean',
                                         initializer=np.zeros(
                                             shape, 'float64'),
                                         dtype=tf.float64)
            # note: the variable is named 'std' but stores the running variance
            self._var = tf.get_variable('std',
                                        initializer=np.ones(shape, 'float64'),
                                        dtype=tf.float64)
            self._count = tf.get_variable('count',
                                          initializer=np.full((), epsilon,
                                                              'float64'),
                                          dtype=tf.float64)

        self.update_ops = tf.group([
            self._var.assign(self._new_var),
            self._mean.assign(self._new_mean),
            self._count.assign(self._new_count)
        ])

        sess.run(tf.variables_initializer([self._mean, self._var,
                                           self._count]))
        self.sess = sess
        self._set_mean_var_count()
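For context, a sketch of how the values fed through update_ops are usually produced: the standard parallel-moments merge (Chan et al.), which baselines-style RunningMeanStd implementations use to fold a batch's moments into the running statistics:

def update_mean_var_count_from_moments(mean, var, count,
                                       batch_mean, batch_var, batch_count):
    # Merge running moments with a batch's moments (parallel algorithm).
    delta = batch_mean - mean
    tot_count = count + batch_count
    new_mean = mean + delta * batch_count / tot_count
    m_a = var * count
    m_b = batch_var * batch_count
    M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
    return new_mean, M2 / tot_count, tot_count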
Example #3
    def __init__(self,
                 dimo,
                 dimu,
                 o_stats,
                 u_stats,
                 clip_norm=5,
                 norm_eps=1e-4,
                 hidden=400,
                 layers=4,
                 learning_rate=1e-3):
        # store constructor arguments that are referenced via self below
        self.dimo, self.dimu = dimo, dimu
        self.o_stats, self.u_stats = o_stats, u_stats
        self.learning_rate = learning_rate
        self.sess = U.get_session()
        with tf.variable_scope('forward_dynamics'):
            self.obs0 = tf.placeholder(tf.float32,
                                       shape=(None, self.dimo),
                                       name='obs0')
            self.obs1 = tf.placeholder(tf.float32,
                                       shape=(None, self.dimo),
                                       name='obs1')
            self.actions = tf.placeholder(tf.float32,
                                          shape=(None, self.dimu),
                                          name='actions')

            self.dynamics_scope = tf.get_variable_scope().name
            obs0_norm = self.o_stats.normalize(self.obs0)
            obs1_norm = self.o_stats.normalize(self.obs1)
            actions_norm = self.u_stats.normalize(self.actions)
            # the network predicts the normalized state difference (delta)
            inputs = tf.concat(values=[obs0_norm, actions_norm], axis=-1)
            self.next_state_diff_tf = nn(inputs,
                                         [hidden] * layers + [self.dimo])
            self.next_state_denorm = self.o_stats.denormalize(
                self.next_state_diff_tf + obs0_norm)

            # no normalize
            # input = tf.concat(values=[self.obs0, self.actions], axis=-1)
            # self.next_state_diff_tf = nn(input,[hidden] * layers+ [self.dimo])
            # self.next_state_tf = self.next_state_diff_tf + self.obs0
            # self.next_state_denorm = self.next_state_tf

        # loss: L1 distance between predicted and actual normalized state change
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf - (obs1_norm - obs0_norm)), axis=1)
        # self.per_sample_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1), axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.test_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_denorm - self.obs1))
        # self.test_loss_tf = tf.reduce_mean(tf.abs(self.next_state_tf - self.obs1))

        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.dynamics_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                     scale_grad_by_procs=False)
        # initial
        tf.variables_initializer(_vars(self.dynamics_scope)).run()
        self.dynamics_adam.sync()
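A hedged sketch of one training step for this model (it assumes the baselines-style MpiAdam.update(flat_grad, stepsize) signature; train_step is an illustrative name, not part of the excerpt):

def train_step(model, obs0, actions, obs1):
    # Evaluate the flattened gradient and mean loss, then take an
    # MPI-averaged Adam step on the dynamics parameters.
    grads, loss = model.sess.run(
        [model.dynamics_grads, model.mean_loss_tf],
        feed_dict={model.obs0: obs0,
                   model.actions: actions,
                   model.obs1: obs1})
    model.dynamics_adam.update(grads, model.learning_rate)
    return loss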
Example #4
def profile_tf_runningmeanstd():
    import time
    from mher.common import tf_util

    tf_util.get_session(config=tf.ConfigProto(inter_op_parallelism_threads=1,
                                              intra_op_parallelism_threads=1,
                                              allow_soft_placement=True))

    x = np.random.random((376, ))

    n_trials = 10000
    rms = RunningMeanStd()
    tfrms = TfRunningMeanStd()

    tic1 = time.time()
    for _ in range(n_trials):
        rms.update(x)

    tic2 = time.time()
    for _ in range(n_trials):
        tfrms.update(x)

    tic3 = time.time()

    print('rms update time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms update time ({} trials): {} s'.format(n_trials, tic3 - tic2))

    tic1 = time.time()
    for _ in range(n_trials):
        z1 = rms.mean

    tic2 = time.time()
    for _ in range(n_trials):
        z2 = tfrms.mean

    tic3 = time.time()

    # elementwise comparison: the running means are arrays of shape (376,)
    assert np.allclose(z1, z2)

    print('rms get mean time ({} trials): {} s'.format(n_trials, tic2 - tic1))
    print('tfrms get mean time ({} trials): {} s'.format(
        n_trials, tic3 - tic2))
Example #5
    def _create_network(self, reuse=False):
        logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(SAC_ActorCritic, reuse, batch_tf)

        # loss functions
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf)
        q_backup_tf = tf.stop_gradient(target_tf)
        v_backup_tf = tf.stop_gradient(
            self.main.min_q_pi_tf - self.sac_alpha * self.main.logp_pi_tf)

        q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2)
        q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2)
        v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2)
        self.abs_td_error_tf = tf.reduce_mean(
            tf.abs(q_backup_tf - self.main.q1_tf) +
            tf.abs(q_backup_tf - self.main.q2_tf))

        self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf
        self.pi_loss_tf = tf.reduce_mean(
            self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf)

        # variables
        value_params = (get_var(self._name_variable('q')) +
                        get_var(self._name_variable('v')))
        pi_params = get_var(self._name_variable('pi'))
        # gradients
        V_grads_tf = tf.gradients(self.value_loss_tf, value_params)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params)
        self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params)

        # optimizers
        self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False)

        # polyak averaging
        self.main_vars = (get_var(self._name_variable('pi')) +
                          get_var(self._name_variable('q1')) +
                          get_var(self._name_variable('q2')) +
                          get_var(self._name_variable('v')))
        self.target_vars = (get_var(self._name_variable('pi', main=False)) +
                            get_var(self._name_variable('q1', main=False)) +
                            get_var(self._name_variable('q2', main=False)) +
                            get_var(self._name_variable('v', main=False)))

        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()
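For reference, the two target-network helpers invoked at the end are typically thin wrappers over the ops built above (a sketch; _update_target_net is an assumed name):

    def _init_target_net(self):
        self.sess.run(self.init_target_net_op)    # hard copy: target <- main

    def _update_target_net(self):
        self.sess.run(self.update_target_net_op)  # polyak: target <- p*target + (1-p)*main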
Example #6
    def __init__(self,
                 dimo,
                 dimu,
                 clip_norm=5,
                 norm_eps=1e-4,
                 hidden=256,
                 layers=8,
                 learning_rate=1e-3):
        # store constructor arguments that are referenced via self below
        self.dimo, self.dimu = dimo, dimu
        self.learning_rate = learning_rate
        self.obs_normalizer = NormalizerNumpy(size=dimo, eps=norm_eps)
        self.action_normalizer = NormalizerNumpy(size=dimu, eps=norm_eps)
        self.sess = U.get_session()

        with tf.variable_scope('forward_dynamics_numpy'):
            self.obs0_norm = tf.placeholder(tf.float32,
                                            shape=(None, self.dimo),
                                            name='obs0')
            self.obs1_norm = tf.placeholder(tf.float32,
                                            shape=(None, self.dimo),
                                            name='obs1')
            self.actions_norm = tf.placeholder(tf.float32,
                                               shape=(None, self.dimu),
                                               name='actions')

            self.dynamics_scope = tf.get_variable_scope().name
            # placeholders carry already-normalized batches; the network
            # predicts the normalized state difference (delta)
            inputs = tf.concat(values=[self.obs0_norm, self.actions_norm],
                               axis=-1)
            self.next_state_diff_tf = nn(inputs,
                                         [hidden] * layers + [self.dimo])
            self.next_state_norm_tf = self.next_state_diff_tf + self.obs0_norm

        # loss: L1 distance between predicted and actual normalized state change
        self.per_sample_loss_tf = tf.reduce_mean(
            tf.abs(self.next_state_diff_tf -
                   (self.obs1_norm - self.obs0_norm)),
            axis=1)
        self.mean_loss_tf = tf.reduce_mean(self.per_sample_loss_tf)
        self.dynamics_grads = U.flatgrad(self.mean_loss_tf,
                                         _vars(self.dynamics_scope),
                                         clip_norm=clip_norm)

        # optimizers
        self.dynamics_adam = MpiAdam(_vars(self.dynamics_scope),
                                     scale_grad_by_procs=False)
        # initial
        tf.variables_initializer(_vars(self.dynamics_scope)).run()
        self.dynamics_adam.sync()
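Note the design difference from Example #3: there the o_stats/u_stats normalizers live inside the graph and raw observations are fed in, whereas this variant keeps normalization in numpy (NormalizerNumpy) and expects the obs0_norm/obs1_norm/actions_norm placeholders to be fed already-normalized batches.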
Example #7
def test_nonfreeze():
    np.random.seed(0)
    tf.set_random_seed(0)

    a = tf.Variable(np.random.randn(3).astype('float32'))
    b = tf.Variable(np.random.randn(2, 5).astype('float32'))
    loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b))

    stepsize = 1e-2
    # for some reason the session config with inter_op_parallelism_threads was causing
    # nested sess.run calls to freeze
    config = tf.ConfigProto(inter_op_parallelism_threads=1)
    sess = U.get_session(config=config)
    update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD,
                                 learning_rate=stepsize).minimize(loss)
    sess.run(tf.global_variables_initializer())
    losslist_ref = []
    for i in range(100):
        l, _ = sess.run([loss, update_op])
        print(i, l)
        losslist_ref.append(l)
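Because MpiAdamOptimizer averages gradients across MPI.COMM_WORLD, the test is normally launched under mpirun (a sketch; the file name is illustrative):

# e.g.   mpirun -np 2 python test_mpi_adam.py
if __name__ == '__main__':
    test_nonfreeze()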
Example #8
    def _create_network(self, reuse=False):
        logger.info("Creating a DDPG agent with action space %d x %s..." %
                    (self.dimu, self.max_u))
        self.sess = tf_util.get_session()
        # normalizer for input
        self._create_normalizer(reuse)
        batch_tf = self._get_batch_tf()

        # networks
        self._create_target_main(ActorCritic, reuse, batch_tf)

        # loss functions
        target_Q_pi_tf = self.target.Q_pi_tf
        clip_range = (-self.clip_return,
                      0. if self.clip_pos_returns else np.inf)
        target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf)

        self.abs_td_error_tf = tf.abs(
            tf.stop_gradient(target_tf) - self.main.Q_tf)
        self.Q_loss = tf.square(self.abs_td_error_tf)
        if self.priority:
            self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss)
        else:
            self.Q_loss_tf = tf.reduce_mean(self.Q_loss)
        self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf)
        self.pi_loss_tf += self.action_l2 * tf.reduce_mean(
            tf.square(self.main.pi_tf / self.max_u))

        # variables
        self.main_Q_var = get_var(self.scope + '/main/Q')
        self.main_pi_var = get_var(self.scope + '/main/pi')
        self.target_Q_var = get_var(self.scope + '/target/Q')
        self.target_pi_var = get_var(self.scope + '/target/pi')

        Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var)
        pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var)
        assert len(self.main_Q_var) == len(Q_grads_tf)
        assert len(self.main_pi_var) == len(pi_grads_tf)
        # materialize as lists: zip() returns a one-shot iterator in Python 3
        self.Q_grads_vars_tf = list(zip(Q_grads_tf, self.main_Q_var))
        self.pi_grads_vars_tf = list(zip(pi_grads_tf, self.main_pi_var))
        self.Q_grad_tf = flatten_grads(grads=Q_grads_tf,
                                       var_list=self.main_Q_var)
        self.pi_grad_tf = flatten_grads(grads=pi_grads_tf,
                                        var_list=self.main_pi_var)

        # optimizers
        self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False)
        self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False)
        self.main_vars = self.main_Q_var + self.main_pi_var
        self.target_vars = self.target_Q_var + self.target_pi_var
        self.init_target_net_op = list(
            map(lambda v: v[0].assign(v[1]),
                zip(self.target_vars, self.main_vars)))
        self.update_target_net_op = list(
            map(
                lambda v: v[0].assign(self.polyak * v[0] +
                                      (1. - self.polyak) * v[1]),
                zip(self.target_vars, self.main_vars)))

        # initialize all variables
        self.global_vars = get_var(self.scope, key='global')
        tf.variables_initializer(self.global_vars).run()
        self._sync_optimizers()
        self._init_target_net()

def _serialize_variables():
    sess = get_session()
    variables = tf.trainable_variables()
    values = sess.run(variables)
    return {var.name: value for var, value in zip(variables, values)}
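A natural counterpart, not part of the excerpt: restoring such a dict with the TF1 tf.Variable.load API (a sketch):

def _deserialize_variables(values):
    # Inverse of _serialize_variables: push saved numpy arrays back into
    # the trainable variables, matched by name.
    sess = get_session()
    for var in tf.trainable_variables():
        if var.name in values:
            var.load(values[var.name], sess)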