Example #1
class AgentLayer(tf.keras.layers.Layer):
    def __init__(self, *args, **kwargs):

        for argi, arg in enumerate(args):
            if argi == 0:
                self.sess = arg
            elif argi == 1:
                self.get_state_size = arg
            elif argi == 2:
                self.get_goal_size = arg
            elif argi == 3:
                self.get_action_size = arg
            Logger.print("argi: " + str(argi))

        super(AgentLayer, self).__init__(*args, **kwargs)

    def build(self, input_shape):
        self.w = self.add_weight(shape=input_shape[1:],
                                 dtype=tf.float32,
                                 initializer=tf.keras.initializers.ones(),
                                 regularizer=tf.keras.regularizers.l2(0.02),
                                 trainable=True)

        with tf.device('cpu:0'):
            with self.sess.as_default():  #, self.graph.as_default():
                # with scope agent
                # with scope resource
                #with tf.variable_scope(self.RESOURCE_SCOPE):
                self.s_norm = TFNormalizer(
                    self.sess, 's_norm', self.get_state_size(),
                    self.world.env.build_state_norm_groups(self.id))
                self.s_norm.set_mean_std(
                    -self.world.env.build_state_offset(self.id),
                    1 / self.world.env.build_state_scale(self.id))

                self.g_norm = TFNormalizer(
                    self.sess, 'g_norm', self.get_goal_size(),
                    self.world.env.build_goal_norm_groups(self.id))
                self.g_norm.set_mean_std(
                    -self.world.env.build_goal_offset(self.id),
                    1 / self.world.env.build_goal_scale(self.id))

                self.a_norm = TFNormalizer(self.sess, 'a_norm',
                                           self.get_action_size())
                self.a_norm.set_mean_std(
                    -self.world.env.build_action_offset(self.id),
                    1 / self.world.env.build_action_scale(self.id))

    # The call method may run in graph mode, in which case
    # `training` is traced as a tensor rather than a Python bool.
    @tf.function
    def call(self, inputs, training=None):
        if training:
            return inputs + self.w
        else:
            return inputs + self.w * 0.5
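The comment above alludes to the `training` flag becoming a tensor when the layer is traced. A minimal standalone sketch (a hypothetical ToyLayer, not part of the original codebase) of that behavior under `@tf.function`:

import tensorflow as tf

class ToyLayer(tf.keras.layers.Layer):
    def build(self, input_shape):
        # one weight per feature, same shape convention as AgentLayer.build
        self.w = self.add_weight(shape=input_shape[1:], initializer="ones", trainable=True)

    @tf.function
    def call(self, inputs, training=None):
        # `training` may arrive as a Python bool or, under tracing, as a symbolic
        # tensor; AutoGraph converts this branch to tf.cond in the latter case.
        if training:
            return inputs + self.w
        return inputs + self.w * 0.5

layer = ToyLayer()
x = tf.ones([2, 3])
print(layer(x, training=True).numpy())   # adds the full weight
print(layer(x, training=False).numpy())  # adds half the weight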
Example #2
class PGAgent(TFAgent):
    NAME = 'PG'

    ACTOR_NET_KEY = 'ActorNet'
    ACTOR_STEPSIZE_KEY = 'ActorStepsize'
    ACTOR_MOMENTUM_KEY = 'ActorMomentum'
    ACTOR_WEIGHT_DECAY_KEY = 'ActorWeightDecay'
    ACTOR_INIT_OUTPUT_SCALE_KEY = 'ActorInitOutputScale'

    CRITIC_NET_KEY = 'CriticNet'
    CRITIC_STEPSIZE_KEY = 'CriticStepsize'
    CRITIC_MOMENTUM_KEY = 'CriticMomentum'
    CRITIC_WEIGHT_DECAY_KEY = 'CriticWeightDecay'

    EXP_ACTION_FLAG = 1 << 0

    def __init__(self, world, id, json_data):
        self._exp_action = False
        super().__init__(world, id, json_data)
        return

    def reset(self):
        super().reset()
        self._exp_action = False
        return

    def _check_action_space(self):
        action_space = self.get_action_space()
        return action_space == ActionSpace.Continuous

    def _load_params(self, json_data):
        super()._load_params(json_data)
        self.val_min, self.val_max = self._calc_val_bounds(self.discount)
        self.val_fail, self.val_succ = self._calc_term_vals(self.discount)
        return

    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = 1 if (
            self.ACTOR_INIT_OUTPUT_SCALE_KEY
            not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]

        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size],
                                   name="s")  # observations
        self.tar_val_tf = tf.placeholder(tf.float32,
                                         shape=[None],
                                         name="tar_val")  # target value s
        self.adv_tf = tf.placeholder(tf.float32, shape=[None],
                                     name="adv")  # advantage
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size],
                                   name="a")  # target actions
        self.g_tf = tf.placeholder(
            tf.float32,
            shape=([None, g_size] if self.has_goal() else None),
            name="g")  # goals

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.actor_tf = self._build_net_actor(actor_net_name,
                                                      actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if self.actor_tf is not None:
            Logger.print('Built actor net: ' + actor_net_name)

        if self.critic_tf is not None:
            Logger.print('Built critic net: ' + critic_net_name)

        return

    def _build_normalizers(self):
        super()._build_normalizers()
        with self.sess.as_default(), self.graph.as_default(
        ), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                val_offset, val_scale = self._calc_val_offset_scale(
                    self.discount)
                self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
                self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
        return

    def _init_normalizers(self):
        super()._init_normalizers()
        with self.sess.as_default(), self.graph.as_default():
            self.val_norm.update()
        return

    def _load_normalizers(self):
        super()._load_normalizers()
        self.val_norm.load()
        return

    def _build_losses(self, json_data):
        actor_weight_decay = 0 if (
            self.ACTOR_WEIGHT_DECAY_KEY
            not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
        critic_weight_decay = 0 if (
            self.CRITIC_WEIGHT_DECAY_KEY
            not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]

        norm_val_diff = self.val_norm.normalize_tf(
            self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
        self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

        if (critic_weight_decay != 0):
            self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss(
                'main/critic')

        norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
        norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

        self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
        self.actor_loss_tf *= self.adv_tf
        self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

        norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
        norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
        a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min,
                                              norm_a_bound_max)
        a_bound_loss /= self.exp_params_curr.noise
        self.actor_loss_tf += a_bound_loss

        if (actor_weight_decay != 0):
            self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss(
                'main/actor')

        return

    def _build_solvers(self, json_data):
        actor_stepsize = 0.001 if (self.ACTOR_STEPSIZE_KEY not in json_data
                                   ) else json_data[self.ACTOR_STEPSIZE_KEY]
        actor_momentum = 0.9 if (self.ACTOR_MOMENTUM_KEY not in json_data
                                 ) else json_data[self.ACTOR_MOMENTUM_KEY]
        critic_stepsize = 0.01 if (self.CRITIC_STEPSIZE_KEY not in json_data
                                   ) else json_data[self.CRITIC_STEPSIZE_KEY]
        critic_momentum = 0.9 if (self.CRITIC_MOMENTUM_KEY not in json_data
                                  ) else json_data[self.CRITIC_MOMENTUM_KEY]

        critic_vars = self._tf_vars('main/critic')
        critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize,
                                                momentum=critic_momentum)
        self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
        self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

        actor_vars = self._tf_vars('main/actor')
        actor_opt = tf.train.MomentumOptimizer(learning_rate=actor_stepsize,
                                               momentum=actor_momentum)
        self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
        self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

        return

    def _build_net_actor(self, net_name, init_output_scale):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]

        h = NetBuilder.build_net(net_name, input_tfs)
        norm_a_tf = tf.layers.dense(
            inputs=h,
            units=self.get_action_size(),
            activation=None,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-init_output_scale, maxval=init_output_scale))

        a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
        return a_tf

    def _build_net_critic(self, net_name):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]

        h = NetBuilder.build_net(net_name, input_tfs)
        norm_val_tf = tf.layers.dense(
            inputs=h,
            units=1,
            activation=None,
            kernel_initializer=TFUtil.xavier_initializer)

        norm_val_tf = tf.reshape(norm_val_tf, [-1])
        val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
        return val_tf

    def _initialize_vars(self):
        super()._initialize_vars()
        self._sync_solvers()
        return

    def _sync_solvers(self):
        self.actor_solver.sync()
        self.critic_solver.sync()
        return

    def _decide_action(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            self._exp_action = False
            a = self._eval_actor(s, g)[0]
            logp = 0

            if self._enable_stoch_policy():
                # epsilon-greedy
                rand_action = MathUtil.flip_coin(self.exp_params_curr.rate)
                if rand_action:
                    norm_exp_noise = np.random.randn(*a.shape)
                    norm_exp_noise *= self.exp_params_curr.noise
                    exp_noise = norm_exp_noise * self.a_norm.std
                    a += exp_noise

                    logp = self._calc_action_logp(norm_exp_noise)
                    self._exp_action = True

        return a, logp

    def _enable_stoch_policy(self):
        return self.enable_training and (self._mode == self.Mode.TRAIN
                                         or self._mode == self.Mode.TRAIN_END)

    def _eval_actor(self, s, g):
        s = np.reshape(s, [-1, self.get_state_size()])
        g = np.reshape(g,
                       [-1, self.get_goal_size()]) if self.has_goal() else None

        feed = {self.s_tf: s, self.g_tf: g}

        a = self.actor_tf.eval(feed)
        return a

    def _eval_critic(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            s = np.reshape(s, [-1, self.get_state_size()])
            g = np.reshape(
                g, [-1, self.get_goal_size()]) if self.has_goal() else None

            feed = {self.s_tf: s, self.g_tf: g}

            val = self.critic_tf.eval(feed)
        return val

    def _record_flags(self):
        flags = int(0)
        if (self._exp_action):
            flags = flags | self.EXP_ACTION_FLAG
        return flags

    def _train_step(self):
        super()._train_step()

        critic_loss = self._update_critic()
        actor_loss = self._update_actor()
        critic_loss = MPIUtil.reduce_avg(critic_loss)
        actor_loss = MPIUtil.reduce_avg(actor_loss)

        critic_stepsize = self.critic_solver.get_stepsize()
        actor_stepsize = self.actor_solver.get_stepsize()

        self.logger.log_tabular('Critic_Loss', critic_loss)
        self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
        self.logger.log_tabular('Actor_Loss', actor_loss)
        self.logger.log_tabular('Actor_Stepsize', actor_stepsize)

        return

    def _update_critic(self):
        idx = self.replay_buffer.sample(self._local_mini_batch_size)
        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if self.has_goal() else None

        tar_V = self._calc_updated_vals(idx)
        tar_V = np.clip(tar_V, self.val_min, self.val_max)

        feed = {self.s_tf: s, self.g_tf: g, self.tar_val_tf: tar_V}

        loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf],
                                    feed)
        self.critic_solver.update(grads)
        return loss

    def _update_actor(self):
        key = self.EXP_ACTION_FLAG
        idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size,
                                                 key)
        has_goal = self.has_goal()

        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if has_goal else None
        a = self.replay_buffer.get('actions', idx)

        V_new = self._calc_updated_vals(idx)
        V_old = self._eval_critic(s, g)
        adv = V_new - V_old

        feed = {self.s_tf: s, self.g_tf: g, self.a_tf: a, self.adv_tf: adv}

        loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf],
                                    feed)
        self.actor_solver.update(grads)

        return loss

    def _calc_updated_vals(self, idx):
        r = self.replay_buffer.get('rewards', idx)

        if self.discount == 0:
            new_V = r
        else:
            next_idx = self.replay_buffer.get_next_idx(idx)
            s_next = self.replay_buffer.get('states', next_idx)
            g_next = self.replay_buffer.get(
                'goals', next_idx) if self.has_goal() else None

            is_end = self.replay_buffer.is_path_end(idx)
            is_fail = self.replay_buffer.check_terminal_flag(
                idx, Env.Terminate.Fail)
            is_succ = self.replay_buffer.check_terminal_flag(
                idx, Env.Terminate.Succ)
            is_fail = np.logical_and(is_end, is_fail)
            is_succ = np.logical_and(is_end, is_succ)

            V_next = self._eval_critic(s_next, g_next)
            V_next[is_fail] = self.val_fail
            V_next[is_succ] = self.val_succ

            new_V = r + self.discount * V_next
        return new_V

    def _calc_action_logp(self, norm_action_deltas):
        # norm action deltas are for the normalized actions (scaled by self.a_norm.std)
        stdev = self.exp_params_curr.noise
        assert stdev > 0

        a_size = self.get_action_size()
        logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas),
                                               axis=-1)
        logp += -0.5 * a_size * np.log(2 * np.pi)
        logp += -a_size * np.log(stdev)
        return logp

    def _log_val(self, s, g):
        val = self._eval_critic(s, g)
        norm_val = self.val_norm.normalize(val)
        self.world.env.log_val(self.id, norm_val[0])
        return

    def _build_replay_buffer(self, buffer_size):
        super()._build_replay_buffer(buffer_size)
        self.replay_buffer.add_filter_key(self.EXP_ACTION_FLAG)
        return
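For reference, `_calc_action_logp` above evaluates the log-density of the sampled exploration noise under an isotropic Gaussian N(0, stdev^2 * I) in normalized action space. A standalone numpy sketch with hypothetical values:

import numpy as np

stdev = 0.2                               # exp_params_curr.noise (hypothetical value)
norm_delta = np.array([0.1, -0.3, 0.05])  # normalized exploration noise for one action
a_size = norm_delta.shape[-1]

logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_delta), axis=-1)
logp += -0.5 * a_size * np.log(2 * np.pi)
logp += -a_size * np.log(stdev)
print(logp)  # log N(norm_delta; 0, stdev^2 * I)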
Example #3
class TFAgent(RLAgent):
    RESOURCE_SCOPE = 'resource'
    SOLVER_SCOPE = 'solvers'

    def __init__(self, world, id, json_data):
        self.tf_scope = 'agent'
        self.graph = tf.Graph()
        self.sess = tf.Session(graph=self.graph)

        super().__init__(world, id, json_data)
        self._build_graph(json_data)
        self._init_normalizers()
        return

    def __del__(self):
        self.sess.close()
        return

    def save_model(self, out_path):
        with self.sess.as_default(), self.graph.as_default():
            try:
                save_path = self.saver.save(self.sess, out_path, write_meta_graph=False, write_state=False)
                Logger.print2('Model saved to: ' + save_path)
            except Exception:
                Logger.print2("Failed to save model to: " + out_path)
        return

    def load_model(self, in_path):
        with self.sess.as_default(), self.graph.as_default():
            self.saver.restore(self.sess, in_path)
            self._load_normalizers()
            Logger.print2('Model loaded from: ' + in_path)
        return

    def _get_output_path(self):
        assert(self.output_dir != '')
        file_path = self.output_dir + '/agent' + str(self.id) + '_model.ckpt'
        return file_path

    def _get_int_output_path(self):
        assert(self.int_output_dir != '')
        file_path = self.int_output_dir + ('/agent{:d}_models/agent{:d}_int_model_{:010d}.ckpt').format(self.id, self.id, self.iter)
        return file_path

    def _build_graph(self, json_data):
        with self.sess.as_default(), self.graph.as_default():
            with tf.variable_scope(self.tf_scope):
                self._build_nets(json_data)
                
                with tf.variable_scope(self.SOLVER_SCOPE):
                    self._build_losses(json_data)
                    self._build_solvers(json_data)

                self._initialize_vars()
                self._build_saver()
        return

    def _init_normalizers(self):
        with self.sess.as_default(), self.graph.as_default():
            # update normalizers to sync the tensorflow tensors
            self.s_norm.update()
            self.g_norm.update()
            self.a_norm.update()
        return

    @abstractmethod
    def _build_nets(self, json_data):
        pass

    @abstractmethod
    def _build_losses(self, json_data):
        pass

    @abstractmethod
    def _build_solvers(self, json_data):
        pass

    def _tf_vars(self, scope=''):
        with self.sess.as_default(), self.graph.as_default():
            res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.tf_scope + '/' + scope)
            assert len(res) > 0
        return res

    def _build_normalizers(self):
        with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                self.s_norm = TFNormalizer(self.sess, 's_norm', self.get_state_size(), self.world.env.build_state_norm_groups(self.id))
                state_offset = -self.world.env.build_state_offset(self.id)
                print("state_offset=", state_offset)
                state_scale = 1 / self.world.env.build_state_scale(self.id)
                print("state_scale=", state_scale)
                self.s_norm.set_mean_std(state_offset, state_scale)
                
                self.g_norm = TFNormalizer(self.sess, 'g_norm', self.get_goal_size(), self.world.env.build_goal_norm_groups(self.id))
                self.g_norm.set_mean_std(-self.world.env.build_goal_offset(self.id), 
                                         1 / self.world.env.build_goal_scale(self.id))

                self.a_norm = TFNormalizer(self.sess, 'a_norm', self.get_action_size())
                self.a_norm.set_mean_std(-self.world.env.build_action_offset(self.id), 
                                         1 / self.world.env.build_action_scale(self.id))
        return

    def _load_normalizers(self):
        self.s_norm.load()
        self.g_norm.load()
        self.a_norm.load()
        return

    def _update_normalizers(self):
        with self.sess.as_default(), self.graph.as_default():
            super()._update_normalizers()
        return

    def _initialize_vars(self):
        self.sess.run(tf.global_variables_initializer())
        return

    def _build_saver(self):
        vars = self._get_saver_vars()
        self.saver = tf.train.Saver(vars, max_to_keep=0)
        return

    def _get_saver_vars(self):
        with self.sess.as_default(), self.graph.as_default():
            vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.tf_scope)
            vars = [v for v in vars if '/' + self.SOLVER_SCOPE + '/' not in v.name]
            #vars = [v for v in vars if '/target/' not in v.name]
            assert len(vars) > 0
        return vars
    
    def _weight_decay_loss(self, scope):
        vars = self._tf_vars(scope)
        vars_no_bias = [v for v in vars if 'bias' not in v.name]
        loss = tf.add_n([tf.nn.l2_loss(v) for v in vars_no_bias])
        return loss

    def _train(self):
        with self.sess.as_default(), self.graph.as_default():
            super()._train()
        return
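A minimal sketch (hypothetical variable names, TF1-style graph mode via `tf.compat.v1`) of the scope layout this class relies on: all variables live under `tf_scope` ('agent'), networks under sub-scopes such as 'main/actor', and optimizer state under SOLVER_SCOPE ('solvers'), which is why `_tf_vars` prefixes queries with the agent scope and `_get_saver_vars` filters out the solver scope:

import tensorflow.compat.v1 as tf

graph = tf.Graph()
with graph.as_default():
    with tf.variable_scope('agent'):
        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                w = tf.get_variable('w', shape=[4, 2])            # network weight
        with tf.variable_scope('solvers'):
            buf = tf.get_variable('momentum_buf', shape=[4, 2])   # optimizer state

    # _tf_vars('main/actor') -> trainable variables under 'agent/main/actor'
    actor_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='agent/main/actor')
    # _get_saver_vars() -> everything under 'agent' except the solver scope
    saver_vars = [v for v in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='agent')
                  if '/solvers/' not in v.name]
    print([v.name for v in actor_vars])   # ['agent/main/actor/w:0']
    print([v.name for v in saver_vars])   # excludes 'agent/solvers/momentum_buf:0'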
Example #4
class PGAgent(TFAgent):
    NAME = 'PG'

    ACTOR_NET_KEY = 'ActorNet'
    ACTOR_STEPSIZE_KEY = 'ActorStepsize'
    ACTOR_MOMENTUM_KEY = 'ActorMomentum'
    ACTOR_WEIGHT_DECAY_KEY = 'ActorWeightDecay'
    ACTOR_INIT_OUTPUT_SCALE_KEY = 'ActorInitOutputScale'

    CRITIC_NET_KEY = 'CriticNet'
    CRITIC_STEPSIZE_KEY = 'CriticStepsize'
    CRITIC_MOMENTUM_KEY = 'CriticMomentum'
    CRITIC_WEIGHT_DECAY_KEY = 'CriticWeightDecay'
    
    EXP_ACTION_FLAG = 1 << 0

    def __init__(self, world, id, json_data): 
        self._exp_action = False
        super().__init__(world, id, json_data)
        return

    def reset(self):
        super().reset()
        self._exp_action = False
        return

    def _check_action_space(self):
        return True

    def _load_params(self, json_data):
        super()._load_params(json_data)
        return

    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
        
        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s") # observations
        self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val") # target values
        self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a") # target actions
        self.g_tf = tf.placeholder(tf.float32, shape=([None, g_size] if self.has_goal() else None), name="g") # goals

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.actor_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if self.actor_tf is not None:
            Logger.print('Built actor net: ' + actor_net_name)

        if self.critic_tf is not None:
            Logger.print('Built critic net: ' + critic_net_name)

        return

    def _build_normalizers(self):
        super()._build_normalizers()
        with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                val_offset, val_scale = self._calc_val_offset_scale(self.discount)
                self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
                self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
        return

    def _init_normalizers(self):
        super()._init_normalizers()
        with self.sess.as_default(), self.graph.as_default():
            self.val_norm.update()
        return

    def _load_normalizers(self):
        super()._load_normalizers()
        self.val_norm.load()
        return

    def _build_losses(self, json_data):
        actor_weight_decay = 0 if (self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
        critic_weight_decay = 0 if (self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]

        norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
        self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

        if (critic_weight_decay != 0):
            self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')
        
        norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
        norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

        self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
        self.actor_loss_tf *= self.adv_tf
        self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

        norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
        norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
        a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
        a_bound_loss /= self.exp_params_curr.noise
        self.actor_loss_tf += a_bound_loss

        if (actor_weight_decay != 0):
            self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')
        
        return

    def _build_solvers(self, json_data):
        actor_stepsize = 0.001 if (self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
        actor_momentum = 0.9 if (self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
        critic_stepsize = 0.01 if (self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
        critic_momentum = 0.9 if (self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]
        
        critic_vars = self._tf_vars('main/critic')
        critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize, momentum=critic_momentum)
        self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
        self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

        actor_vars = self._tf_vars('main/actor')
        actor_opt = tf.train.MomentumOptimizer(learning_rate=actor_stepsize, momentum=actor_momentum)
        self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
        self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

        return

    def _build_net_actor(self, net_name, init_output_scale):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]
        
        h = NetBuilder.build_net(net_name, input_tfs)
        norm_a_tf = tf.layers.dense(inputs=h, units=self.get_action_size(), activation=None,
                                kernel_initializer=tf.random_uniform_initializer(minval=-init_output_scale, maxval=init_output_scale))
        
        a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
        return a_tf
    
    def _build_net_critic(self, net_name):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]
        
        h = NetBuilder.build_net(net_name, input_tfs)
        norm_val_tf = tf.layers.dense(inputs=h, units=1, activation=None,
                                kernel_initializer=TFUtil.xavier_initializer)

        norm_val_tf = tf.reshape(norm_val_tf, [-1])
        val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
        return val_tf

    def _initialize_vars(self):
        super()._initialize_vars()
        self._sync_solvers()
        return

    def _sync_solvers(self):
        self.actor_solver.sync()
        self.critic_solver.sync()
        return

    def _decide_action(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            self._exp_action = False
            a = self._eval_actor(s, g)[0]
            logp = 0

            if self._enable_stoch_policy():
                # epsilon-greedy
                rand_action = MathUtil.flip_coin(self.exp_params_curr.rate)
                if rand_action:
                    norm_exp_noise = np.random.randn(*a.shape)
                    norm_exp_noise *= self.exp_params_curr.noise
                    exp_noise = norm_exp_noise * self.a_norm.std
                    a += exp_noise

                    logp = self._calc_action_logp(norm_exp_noise)
                    self._exp_action = True

        return a, logp

    def _enable_stoch_policy(self):
        return self.enable_training and (self._mode == self.Mode.TRAIN or self._mode == self.Mode.TRAIN_END)

    def _eval_actor(self, s, g):
        s = np.reshape(s, [-1, self.get_state_size()])
        g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None
          
        feed = {
            self.s_tf : s,
            self.g_tf : g
        }

        a = self.actor_tf.eval(feed)
        return a
    
    def _eval_critic(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            s = np.reshape(s, [-1, self.get_state_size()])
            g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

            feed = {
                self.s_tf : s,
                self.g_tf : g
            }

            val = self.critic_tf.eval(feed)    
        return val

    def _calc_action_logp(self, norm_action_deltas):
        # norm action deltas are for the normalized actions (scaled by self.a_norm.std)
        stdev = self.exp_params_curr.noise
        assert stdev > 0

        a_size = self.get_action_size()
        logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas), axis=-1)
        logp += -0.5 * a_size * np.log(2 * np.pi)
        logp += -a_size * np.log(stdev)
        return logp
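The exploration in `_decide_action` above is epsilon-greedy with Gaussian noise: with probability `exp_params_curr.rate` the deterministic action is perturbed by noise of scale `exp_params_curr.noise` in normalized action space, rescaled by `a_norm.std`. A standalone numpy sketch with hypothetical values:

import numpy as np

rate, noise = 0.9, 0.2                   # exp_params_curr.rate / .noise (hypothetical)
a = np.array([0.5, -0.1])                # deterministic actor output
a_norm_std = np.array([1.0, 2.0])        # a_norm.std (hypothetical)

if np.random.rand() < rate:              # MathUtil.flip_coin(rate)
    norm_exp_noise = np.random.randn(*a.shape) * noise
    a = a + norm_exp_noise * a_norm_std  # exploratory action
print(a)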
Example #5
class TFAgent(RLAgent):
    RESOURCE_SCOPE = 'resource'
    SOLVER_SCOPE = 'solvers'

    def __init__(self, world, id, json_data):
        self.tf_scope = 'agent'
        # The graph/session pair (with its GPU memory options) is created in init_session_mem() below.

        self.init_session_mem()

        self.agent_layer = AgentLayer(self.sess, self.get_state_size(),
                                      self.get_goal_size(),
                                      self.get_action_size())
        #Logger.print('agent_layer([1]).numpy(): ' + self.agent_layer([1]).numpy())

        self.json_data = json_data
        super().__init__(world, id, json_data)
        self._build_graph(json_data)
        self._init_normalizers()
        return

    def __del__(self):
        with tf.device('cpu:0'):
            self.sess.close()
        return

    def init_session_mem(self):
        Logger.print(
            '[TFAgent] Init session -> tf.Graph(), tf.Session(...) called.')

        with tf.device('cpu:0'):
            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8,
                                        allow_growth=True)
            config = tf.ConfigProto(gpu_options=gpu_options)
            self.graph = tf.Graph()
            self.sess = tf.Session(config=config, graph=self.graph)
        return

    def clear_session_mem(self):

        with tf.device('cpu:0'):

            Logger.print(
                '[TFAgent] Clear session -> tf.Graph(), tf.Session(...) called.'
            )
            self.sess.close()

            gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8,
                                        allow_growth=True)
            config = tf.ConfigProto(gpu_options=gpu_options)
            self.graph = tf.Graph()
            self.sess = tf.Session(config=config, graph=self.graph)
        return

    def save_model(self, out_path):
        with tf.device('cpu:0'):
            with self.sess.as_default(), self.graph.as_default():
                try:
                    save_path = self.saver.save(self.sess,
                                                out_path,
                                                write_meta_graph=False,
                                                write_state=False)
                    Logger.print('Model saved to: ' + save_path)
                except Exception:
                    Logger.print("Failed to save model to: " + out_path)
        return

    def load_model(self, in_path):
        with tf.device('cpu:0'):
            with self.sess.as_default(), self.graph.as_default():
                Logger.print('Restoring checkpoint for model: ' + in_path)
                self.saver.restore(self.sess, in_path)
                self._load_normalizers()
                Logger.print('Model loaded from: ' + in_path)
        return

    def _get_output_path(self):
        assert (self.output_dir != '')
        file_path = self.output_dir + '/agent' + str(self.id) + '_model.ckpt'
        return file_path

    def _get_int_output_path(self):
        assert (self.int_output_dir != '')
        file_path = self.int_output_dir + (
            '/agent{:d}_models/agent{:d}_int_model_{:010d}.ckpt').format(
                self.id, self.id, self.iter)
        return file_path

    def _build_graph(self, json_data):
        with tf.device('cpu:0'):

            with self.sess.as_default(), self.graph.as_default():
                with tf.variable_scope(self.tf_scope):
                    self._build_nets(json_data)

                    with tf.variable_scope(self.SOLVER_SCOPE):
                        self._build_losses(json_data)
                        self._build_solvers(json_data)

                    self._initialize_vars()
                    self._build_saver()
        return

    def _init_normalizers(self):
        with self.sess.as_default(), self.graph.as_default():
            # update normalizers to sync the tensorflow tensors
            self.s_norm.update()
            self.g_norm.update()
            self.a_norm.update()
        return

    @abstractmethod
    def _build_nets(self, json_data):
        pass

    @abstractmethod
    def _build_losses(self, json_data):
        pass

    @abstractmethod
    def _build_solvers(self, json_data):
        pass

    def _tf_vars(self, scope=''):
        with tf.device('cpu:0'):
            with self.sess.as_default(), self.graph.as_default():
                res = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=self.tf_scope + '/' + scope)
                assert len(res) > 0
                return res

    def _build_normalizers(self):
        Logger.print(
            '[TFAgent] Build normalizers -> TFNormalizer (s_norm, g_norm, a_norm) called.'
        )

        with tf.device('cpu:0'):

            with self.sess.as_default(), self.graph.as_default(
            ), tf.variable_scope(self.tf_scope):
                with tf.variable_scope(self.RESOURCE_SCOPE):
                    self.s_norm = TFNormalizer(
                        self.sess, 's_norm', self.get_state_size(),
                        self.world.env.build_state_norm_groups(self.id))
                    self.s_norm.set_mean_std(
                        -self.world.env.build_state_offset(self.id),
                        1 / self.world.env.build_state_scale(self.id))

                    self.g_norm = TFNormalizer(
                        self.sess, 'g_norm', self.get_goal_size(),
                        self.world.env.build_goal_norm_groups(self.id))
                    self.g_norm.set_mean_std(
                        -self.world.env.build_goal_offset(self.id),
                        1 / self.world.env.build_goal_scale(self.id))

                    self.a_norm = TFNormalizer(self.sess, 'a_norm',
                                               self.get_action_size())
                    self.a_norm.set_mean_std(
                        -self.world.env.build_action_offset(self.id),
                        1 / self.world.env.build_action_scale(self.id))
        return

    def _load_normalizers(self):
        self.s_norm.load()
        self.g_norm.load()
        self.a_norm.load()
        return

    def _update_normalizers(self):
        with self.sess.as_default(), self.graph.as_default():
            super()._update_normalizers()
        return

    def _initialize_vars(self):
        with tf.device('cpu:0'):
            self.sess.run(tf.global_variables_initializer())
        return

    def _build_saver(self):
        with tf.device('cpu:0'):

            vars = self._get_saver_vars()
            self.saver = tf.train.Saver(vars, max_to_keep=0)
        return

    def _get_saver_vars(self):
        with tf.device('cpu:0'):

            with self.sess.as_default(), self.graph.as_default():
                vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                         scope=self.tf_scope)
                vars = [
                    v for v in vars
                    if '/' + self.SOLVER_SCOPE + '/' not in v.name
                ]
                #vars = [v for v in vars if '/target/' not in v.name]
                assert len(vars) > 0
        return vars

    def _weight_decay_loss(self, scope):
        with tf.device('cpu:0'):
            vars = self._tf_vars(scope)
            vars_no_bias = [v for v in vars if 'bias' not in v.name]
            loss = tf.add_n([tf.nn.l2_loss(v) for v in vars_no_bias])
            return loss

    def _train(self):
        with tf.device('cpu:0'):
            with self.sess.as_default(), self.graph.as_default():
                super()._train()
        return
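A minimal sketch (TF1-style API via `tf.compat.v1`, values taken from `init_session_mem`) of the per-agent graph/session setup and the `tf.device('cpu:0')` pinning used throughout this variant:

import tensorflow.compat.v1 as tf

gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8, allow_growth=True)
config = tf.ConfigProto(gpu_options=gpu_options)

graph = tf.Graph()
with graph.as_default(), tf.device('cpu:0'):
    x = tf.constant([1.0, 2.0])
    y = x * 2.0               # placed on the CPU because of the enclosing tf.device scope

sess = tf.Session(config=config, graph=graph)
print(sess.run(y))            # [2. 4.]
sess.close()                  # clear_session_mem closes the old session before making a new pair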
Example #6
class AMPAgent(PPOAgent):
    NAME = "AMP"

    TASK_REWARD_LERP_KEY = "TaskRewardLerp"

    DISC_NET_KEY = "DiscNet"
    DISC_INIT_OUTPUT_SCALE_KEY = "DiscInitOutputScale"
    DISC_WEIGHT_DECAY_KEY = "DiscWeightDecay"
    DISC_LOGIT_REG_WEIGHT_KEY = "DiscLogitRegWeight"
    DISC_STEPSIZE_KEY = "DiscStepSize"
    DISC_MOMENTUM_KEY = "DiscMomentum"
    DISC_BATCH_SIZE_KEY = "DiscBatchSize"
    DISC_STEPS_PER_BATCH_KEY = "DiscStepsPerBatch"
    DISC_EXPERT_BUFFER_SIZE_KEY = "DiscExpertBufferSize"
    DISC_AGENT_BUFFER_SIZE_KEY = "DiscAgentBufferSize"

    REWARD_SCALE_KEY = "RewardScale"
    DISC_GRAD_PENALTY_KEY = "DiscGradPenalty"

    DISC_LOGIT_NAME = "disc_logits"
    DISC_SCOPE = "disc"

    def __init__(self, id, world, json_data):
        super().__init__(id, world, json_data)

        self._disc_reward_mean = 0.0
        self._disc_reward_std = 0.0
        self._reward_min = np.inf
        self._reward_max = -np.inf

        self._build_disc_replay_buffer()

        return

    def __str__(self):
        info_str = super().__str__()
        info_str = info_str[:-2] + ',\n "AMPObsDim": "{:d}"'.format(
            self._get_amp_obs_size()) + info_str[-2:]
        return info_str

    def _load_params(self, json_data):
        super()._load_params(json_data)

        self._task_reward_lerp = 0.5 if (
            self.TASK_REWARD_LERP_KEY
            not in json_data) else json_data[self.TASK_REWARD_LERP_KEY]

        self._disc_batchsize = int(256) if (
            self.DISC_BATCH_SIZE_KEY not in json_data) else int(
                json_data[self.DISC_BATCH_SIZE_KEY])
        self._disc_steps_per_batch = int(1) if (
            self.DISC_STEPS_PER_BATCH_KEY not in json_data) else int(
                json_data[self.DISC_STEPS_PER_BATCH_KEY])
        self._disc_expert_buffer_size = int(100000) if (
            self.DISC_EXPERT_BUFFER_SIZE_KEY not in json_data) else int(
                json_data[self.DISC_EXPERT_BUFFER_SIZE_KEY])
        self._disc_agent_buffer_size = int(100000) if (
            self.DISC_AGENT_BUFFER_SIZE_KEY not in json_data) else int(
                json_data[self.DISC_AGENT_BUFFER_SIZE_KEY])

        return

    def _build_disc_replay_buffer(self):
        num_procs = mpi_util.get_num_procs()
        local_disc_expert_buffer_size = int(
            np.ceil(self._disc_expert_buffer_size / num_procs))
        self._disc_expert_buffer = ReplayBufferRandStorage(
            local_disc_expert_buffer_size)

        local_disc_agent_buffer_size = int(
            np.ceil(self._disc_agent_buffer_size / num_procs))
        self._disc_agent_buffer = ReplayBufferRandStorage(
            local_disc_agent_buffer_size)
        return

    def _build_normalizers(self):
        super()._build_normalizers()
        with self.sess.as_default(), self.graph.as_default(
        ), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                self._amp_obs_norm = TFNormalizer(
                    self.sess, "amp_obs_norm", self._get_amp_obs_size(),
                    self._get_amp_obs_norm_group())
                self._amp_obs_norm.set_mean_std(-self._get_amp_obs_offset(),
                                                1 / self._get_amp_obs_scale())
        return

    def _load_normalizers(self):
        super()._load_normalizers()
        self._amp_obs_norm.load()
        return

    def _update_normalizers(self):
        super()._update_normalizers()
        self._amp_obs_norm.update()
        return

    def _sync_solvers(self):
        super()._sync_solvers()
        self._disc_solver.sync()
        return

    def _build_nets(self, json_data):
        super()._build_nets(json_data)

        assert self.DISC_NET_KEY in json_data

        disc_net_name = json_data[self.DISC_NET_KEY]
        disc_init_output_scale = 1 if (
            self.DISC_INIT_OUTPUT_SCALE_KEY
            not in json_data) else json_data[self.DISC_INIT_OUTPUT_SCALE_KEY]
        self._reward_scale = 1.0 if (self.REWARD_SCALE_KEY not in json_data
                                     ) else json_data[self.REWARD_SCALE_KEY]

        amp_obs_size = self._get_amp_obs_size()

        # setup input tensors
        self._amp_obs_expert_ph = tf.placeholder(tf.float32,
                                                 shape=[None, amp_obs_size],
                                                 name="amp_obs_expert")
        self._amp_obs_agent_ph = tf.placeholder(tf.float32,
                                                shape=[None, amp_obs_size],
                                                name="amp_obs_agent")

        self._disc_expert_inputs = self._get_disc_expert_inputs()
        self._disc_agent_inputs = self._get_disc_agent_inputs()

        with tf.variable_scope(self.MAIN_SCOPE):
            with tf.variable_scope(self.DISC_SCOPE):
                self._disc_logits_expert_tf = self._build_disc_net(
                    disc_net_name, self._disc_expert_inputs,
                    disc_init_output_scale)
                self._disc_logits_agent_tf = self._build_disc_net(
                    disc_net_name,
                    self._disc_agent_inputs,
                    disc_init_output_scale,
                    reuse=True)

        if self._disc_logits_expert_tf is not None:
            Logger.print("Built discriminator net: " + disc_net_name)

        self._disc_prob_agent_tf = tf.sigmoid(self._disc_logits_agent_tf)
        self._abs_logit_agent_tf = tf.reduce_mean(
            tf.abs(self._disc_logits_agent_tf))
        self._avg_prob_agent_tf = tf.reduce_mean(self._disc_prob_agent_tf)

        return

    def _build_losses(self, json_data):
        super()._build_losses(json_data)

        disc_weight_decay = 0 if (self.DISC_WEIGHT_DECAY_KEY not in json_data
                                  ) else json_data[self.DISC_WEIGHT_DECAY_KEY]
        disc_logit_reg_weight = 0 if (
            self.DISC_LOGIT_REG_WEIGHT_KEY
            not in json_data) else json_data[self.DISC_LOGIT_REG_WEIGHT_KEY]
        disc_grad_penalty = 0.0 if (
            self.DISC_GRAD_PENALTY_KEY
            not in json_data) else json_data[self.DISC_GRAD_PENALTY_KEY]

        disc_loss_expert_tf = self.build_disc_loss_pos(
            self._disc_logits_expert_tf)
        disc_loss_agent_tf = self.build_disc_loss_neg(
            self._disc_logits_agent_tf)
        disc_loss_expert_tf = tf.reduce_mean(disc_loss_expert_tf)
        disc_loss_agent_tf = tf.reduce_mean(disc_loss_agent_tf)

        self._disc_loss_tf = 0.5 * (disc_loss_agent_tf + disc_loss_expert_tf)

        self._acc_expert_tf = tf.reduce_mean(
            tf.cast(tf.greater(self._disc_logits_expert_tf, 0), tf.float32))
        self._acc_agent_tf = tf.reduce_mean(
            tf.cast(tf.less(self._disc_logits_agent_tf, 0), tf.float32))

        if (disc_weight_decay != 0):
            self._disc_loss_tf += disc_weight_decay * self._weight_decay_loss(
                self.MAIN_SCOPE + "/" + self.DISC_SCOPE)

        if (disc_logit_reg_weight != 0):
            self._disc_loss_tf += disc_logit_reg_weight * self._disc_logit_reg_loss(
            )

        if (disc_grad_penalty != 0):
            self._grad_penalty_loss_tf = self._disc_grad_penalty_loss(
                in_tfs=self._disc_expert_inputs,
                out_tf=self._disc_logits_expert_tf)
            self._disc_loss_tf += disc_grad_penalty * self._grad_penalty_loss_tf
        else:
            self._grad_penalty_loss_tf = tf.constant(0.0, dtype=tf.float32)

        return

    def _build_solvers(self, json_data):
        super()._build_solvers(json_data)

        disc_stepsize = 0.001 if (self.DISC_STEPSIZE_KEY not in json_data
                                  ) else json_data[self.DISC_STEPSIZE_KEY]
        disc_momentum = 0.9 if (self.DISC_MOMENTUM_KEY not in json_data
                                ) else json_data[self.DISC_MOMENTUM_KEY]

        disc_vars = self._tf_vars(self.MAIN_SCOPE + "/" + self.DISC_SCOPE)
        disc_opt = tf.train.MomentumOptimizer(learning_rate=disc_stepsize,
                                              momentum=disc_momentum)
        self._disc_grad_tf = tf.gradients(self._disc_loss_tf, disc_vars)
        self._disc_solver = mpi_solver.MPISolver(self.sess, disc_opt,
                                                 disc_vars)

        return

    def _build_disc_net(self,
                        net_name,
                        input_tfs,
                        init_output_scale,
                        reuse=False):
        out_size = 1
        h = net_builder.build_net(net_name, input_tfs, reuse)
        logits_tf = tf.layers.dense(
            inputs=h,
            units=out_size,
            activation=None,
            reuse=reuse,
            kernel_initializer=tf.random_uniform_initializer(
                minval=-init_output_scale, maxval=init_output_scale),
            name=self.DISC_LOGIT_NAME)
        return logits_tf

    def _get_disc_expert_inputs(self):
        norm_obs_tf = self._amp_obs_norm.normalize_tf(self._amp_obs_expert_ph)
        input_tfs = [norm_obs_tf]
        return input_tfs

    def _get_disc_agent_inputs(self):
        norm_obs_tf = self._amp_obs_norm.normalize_tf(self._amp_obs_agent_ph)
        input_tfs = [norm_obs_tf]
        return input_tfs

    def _disc_logit_reg_loss(self):
        vars = self._tf_vars(self.MAIN_SCOPE + "/" + self.DISC_SCOPE)
        logit_vars = [
            v for v in vars
            if (self.DISC_LOGIT_NAME in v.name and "bias" not in v.name)
        ]
        loss_tf = tf.add_n([tf.nn.l2_loss(v) for v in logit_vars])
        return loss_tf

    def _disc_grad_penalty_loss(self, in_tfs, out_tf):
        grad_tfs = tf.gradients(ys=out_tf, xs=in_tfs)
        grad_tf = tf.concat(grad_tfs, axis=-1)
        norm_tf = tf.reduce_sum(tf.square(grad_tf), axis=-1)
        loss_tf = 0.5 * tf.reduce_mean(norm_tf)
        return loss_tf

    def reset(self):
        super().reset()

        self.path.amp_obs_expert = []
        self.path.amp_obs_agent = []
        return

    def _store_path(self, path):
        path_id = super()._store_path(path)

        valid_path = (path_id != MathUtil.INVALID_IDX)
        if (valid_path):
            disc_expert_path_id = self._disc_expert_buffer.store(
                path.amp_obs_expert)
            assert (disc_expert_path_id != MathUtil.INVALID_IDX)

            disc_agent_path_id = self._disc_agent_buffer.store(
                path.amp_obs_agent)
            assert (disc_agent_path_id != MathUtil.INVALID_IDX)

        return path_id

    def _update_new_action(self):
        first_step = self._is_first_step()

        super()._update_new_action()

        if (not first_step):
            self._record_amp_obs()

        return

    def _end_path(self):
        super()._end_path()
        self._record_amp_obs()
        return

    def _record_amp_obs(self):
        obs_expert = self._record_amp_obs_expert()
        obs_agent = self._record_amp_obs_agent()
        self.path.amp_obs_expert.append(obs_expert)
        self.path.amp_obs_agent.append(obs_agent)
        return

    def build_disc_loss_pos(self, logits_tf):
        loss_tf = 0.5 * tf.reduce_sum(tf.square(logits_tf - 1), axis=-1)
        return loss_tf

    def build_disc_loss_neg(self, logits_tf):
        loss_tf = 0.5 * tf.reduce_sum(tf.square(logits_tf + 1), axis=-1)
        return loss_tf

    def _enable_amp_task_reward(self):
        enable = self.world.env.enable_amp_task_reward()
        return enable

    def _get_amp_obs_size(self):
        amp_obs_size = self.world.env.get_amp_obs_size()
        return amp_obs_size

    def _get_amp_obs_offset(self):
        offset = np.array(self.world.env.get_amp_obs_offset())
        return offset

    def _get_amp_obs_scale(self):
        scale = np.array(self.world.env.get_amp_obs_scale())
        return scale

    def _get_amp_obs_norm_group(self):
        norm_group = np.array(self.world.env.get_amp_obs_norm_group(),
                              dtype=np.int32)
        return norm_group

    def _record_amp_obs_expert(self):
        obs_expert = np.array(self.world.env.record_amp_obs_expert(self.id))
        return obs_expert

    def _record_amp_obs_agent(self):
        obs_agent = np.array(self.world.env.record_amp_obs_agent(self.id))
        return obs_agent

    def _record_normalizers(self, path):
        super()._record_normalizers(path)

        self._amp_obs_norm.record(np.array(path.amp_obs_expert))
        self._amp_obs_norm.record(np.array(path.amp_obs_agent))
        return

    def _logits_to_reward(self, logits):
        r = 1.0 - 0.25 * np.square(1.0 - logits)
        r = np.maximum(r, 0.0)
        return r

    def _train_step(self):
        disc_info = self._update_disc()

        disc_info["reward_mean"] = self._disc_reward_mean
        disc_info["reward_std"] = self._disc_reward_std
        disc_info = mpi_util.reduce_dict_mean(disc_info)

        super()._train_step()

        self.logger.log_tabular("Disc_Loss", disc_info["loss"])
        self.logger.log_tabular("Disc_Acc_Expert", disc_info["acc_expert"])
        self.logger.log_tabular("Disc_Acc_Agent", disc_info["acc_agent"])
        self.logger.log_tabular("Disc_Stepsize", self.get_disc_stepsize())
        self.logger.log_tabular("Disc_Steps", self.get_disc_steps())
        self.logger.log_tabular("Disc_Prob", disc_info["prob_agent"])
        self.logger.log_tabular("Disc_Abs_Logit", disc_info["abs_logit"])
        self.logger.log_tabular("Disc_Reward_Mean", disc_info["reward_mean"])
        self.logger.log_tabular("Disc_Reward_Std", disc_info["reward_std"])

        if (self._enable_grad_penalty()):
            self.logger.log_tabular("Grad_Penalty", disc_info["grad_penalty"])

        return

    def _update_disc(self):
        info = None

        num_procs = mpi_util.get_num_procs()
        local_expert_batch_size = int(np.ceil(self._disc_batchsize /
                                              num_procs))
        local_agent_batch_size = local_expert_batch_size

        steps_per_batch = self._disc_steps_per_batch
        local_sample_count = self.replay_buffer.get_current_size()
        global_sample_count = int(mpi_util.reduce_sum(local_sample_count))

        num_steps = int(
            np.ceil(steps_per_batch * global_sample_count /
                    (num_procs * local_expert_batch_size)))

        for b in range(num_steps):
            disc_expert_batch = self._disc_expert_buffer.sample(
                local_expert_batch_size)
            obs_expert = self._disc_expert_buffer.get(disc_expert_batch)

            disc_agent_batch = self._disc_agent_buffer.sample(
                local_agent_batch_size)
            obs_agent = self._disc_agent_buffer.get(disc_agent_batch)

            curr_info = self._step_disc(obs_expert=obs_expert,
                                        obs_agent=obs_agent)

            if (info is None):
                info = curr_info
            else:
                for k, v in curr_info.items():
                    info[k] += v

        for k in info.keys():
            info[k] /= num_steps

        return info

    def _step_disc(self, obs_expert, obs_agent):
        feed = {
            self._amp_obs_expert_ph: obs_expert,
            self._amp_obs_agent_ph: obs_agent,
        }

        run_tfs = [
            self._disc_grad_tf, self._disc_loss_tf, self._acc_expert_tf,
            self._acc_agent_tf, self._avg_prob_agent_tf,
            self._abs_logit_agent_tf, self._grad_penalty_loss_tf
        ]
        results = self.sess.run(run_tfs, feed)

        grads = results[0]
        self._disc_solver.update(grads)

        info = {
            "loss": results[1],
            "acc_expert": results[2],
            "acc_agent": results[3],
            "prob_agent": results[4],
            "abs_logit": results[5],
            "grad_penalty": results[6],
        }
        return info

    def get_disc_stepsize(self):
        return self._disc_solver.get_stepsize()

    def get_disc_steps(self):
        return self._disc_solver.iter

    def _enable_grad_penalty(self):
        return self._grad_penalty_loss_tf.op.type != "Const"

    def _fetch_batch_rewards(self, start_idx, end_idx):
        idx = np.array(list(range(start_idx, end_idx)))
        rewards = self._batch_calc_reward(idx)
        return rewards

    def _batch_calc_reward(self, idx):
        obs_agent = self.replay_buffer.get("amp_obs_agent", idx)
        disc_r, _ = self._calc_disc_reward(obs_agent)

        end_mask = self.replay_buffer.is_path_end(idx)
        valid_mask = np.logical_not(end_mask)

        disc_r *= self._reward_scale
        valid_disc_r = disc_r[valid_mask]
        self._disc_reward_mean = np.mean(valid_disc_r)
        self._disc_reward_std = np.std(valid_disc_r)

        if (self._enable_amp_task_reward()):
            task_r = self.replay_buffer.get("rewards", idx)
            r = self._lerp_reward(disc_r, task_r)
        else:
            r = disc_r

        curr_reward_min = np.amin(r)
        curr_reward_max = np.amax(r)
        self._reward_min = np.minimum(self._reward_min, curr_reward_min)
        self._reward_max = np.maximum(self._reward_max, curr_reward_max)
        reward_data = np.array([self._reward_min, -self._reward_max])
        reward_data = mpi_util.reduce_min(reward_data)

        self._reward_min = reward_data[0]
        self._reward_max = -reward_data[1]

        return r

    def _lerp_reward(self, disc_r, task_r):
        r = (1.0 -
             self._task_reward_lerp) * disc_r + self._task_reward_lerp * task_r
        return r

    def _calc_disc_reward(self, amp_obs):
        feed = {
            self._amp_obs_agent_ph: amp_obs,
        }
        logits = self.sess.run(self._disc_logits_agent_tf, feed_dict=feed)
        r = self._logits_to_reward(logits)
        r = r[:, 0]
        return r, logits
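
    # Illustrative sketch (assumption, not shown in this snippet): AMP-style
    # agents often map a discriminator logit d to a non-negative reward via
    # r = max(0, 1 - 0.25 * (d - 1)^2). _logits_to_reward is assumed to do
    # something along these lines; this hypothetical helper is for reference.
    @staticmethod
    def _example_logits_to_reward(logits):
        return np.maximum(0.0, 1.0 - 0.25 * np.square(logits - 1.0))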

    def _compute_batch_vals(self, start_idx, end_idx):
        states = self.replay_buffer.get_all("states")[start_idx:end_idx]
        goals = self.replay_buffer.get_all(
            "goals")[start_idx:end_idx] if self.has_goal() else None
        vals = self._eval_critic(states, goals)

        val_min = self._reward_min / (1.0 - self.discount)
        val_max = self._reward_max / (1.0 - self.discount)
        vals = np.clip(vals, val_min, val_max)

        idx = np.arange(start_idx, end_idx)
        is_end = self.replay_buffer.is_path_end(idx)
        is_fail = self.replay_buffer.check_terminal_flag(
            idx, Env.Terminate.Fail)
        is_succ = self.replay_buffer.check_terminal_flag(
            idx, Env.Terminate.Succ)
        is_fail = np.logical_and(is_end, is_fail)
        is_succ = np.logical_and(is_end, is_succ)
        vals[is_fail] = self.val_fail
        vals[is_succ] = self.val_succ

        return vals
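
    # Note (added for clarity): the clip bounds in _compute_batch_vals come
    # from the geometric-series bound on a discounted return,
    # |V| <= |r| / (1 - gamma). For example, with self.discount = 0.95 and
    # rewards in [0, 1], values are clipped to [0, 20].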
Beispiel #7
0
class PGAgent(TFAgent):
    NAME = 'PG'

    ACTOR_NET_KEY = 'ActorNet'
    ACTOR_STEPSIZE_KEY = 'ActorStepsize'
    ACTOR_MOMENTUM_KEY = 'ActorMomentum'
    ACTOR_WEIGHT_DECAY_KEY = 'ActorWeightDecay'
    ACTOR_INIT_OUTPUT_SCALE_KEY = 'ActorInitOutputScale'

    CRITIC_NET_KEY = 'CriticNet'
    CRITIC_STEPSIZE_KEY = 'CriticStepsize'
    CRITIC_MOMENTUM_KEY = 'CriticMomentum'
    CRITIC_WEIGHT_DECAY_KEY = 'CriticWeightDecay'
    
    EXP_ACTION_FLAG = 1 << 0

    def __init__(self, world, id, json_data): 
        self._exp_action = False
        super().__init__(world, id, json_data)
        return

    def reset(self):
        super().reset()
        self._exp_action = False
        return

    def _check_action_space(self):
        action_space = self.get_action_space()
        return action_space == ActionSpace.Continuous

    def _load_params(self, json_data):
        super()._load_params(json_data)
        self.val_min, self.val_max = self._calc_val_bounds(self.discount)
        self.val_fail, self.val_succ = self._calc_term_vals(self.discount)
        return

    def _build_nets(self, json_data):
        assert self.ACTOR_NET_KEY in json_data
        assert self.CRITIC_NET_KEY in json_data

        actor_net_name = json_data[self.ACTOR_NET_KEY]
        critic_net_name = json_data[self.CRITIC_NET_KEY]
        actor_init_output_scale = 1 if (self.ACTOR_INIT_OUTPUT_SCALE_KEY not in json_data) else json_data[self.ACTOR_INIT_OUTPUT_SCALE_KEY]
        
        s_size = self.get_state_size()
        g_size = self.get_goal_size()
        a_size = self.get_action_size()

        # setup input tensors
        self.s_tf = tf.placeholder(tf.float32, shape=[None, s_size], name="s") # observations
        self.tar_val_tf = tf.placeholder(tf.float32, shape=[None], name="tar_val") # target values
        self.adv_tf = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage
        self.a_tf = tf.placeholder(tf.float32, shape=[None, a_size], name="a") # target actions
        self.g_tf = tf.placeholder(tf.float32, shape=([None, g_size] if self.has_goal() else None), name="g") # goals

        with tf.variable_scope('main'):
            with tf.variable_scope('actor'):
                self.actor_tf = self._build_net_actor(actor_net_name, actor_init_output_scale)
            with tf.variable_scope('critic'):
                self.critic_tf = self._build_net_critic(critic_net_name)

        if (self.actor_tf is not None):
            Logger.print2('Built actor net: ' + actor_net_name)

        if (self.critic_tf is not None):
            Logger.print2('Built critic net: ' + critic_net_name)

        return
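
    # Illustrative sketch only: NetBuilder.build_net is not shown in this
    # snippet. For a net name like "fc_2layers_1024units", the builder is
    # assumed to produce a small fully-connected trunk roughly like the
    # hypothetical helper below (same tf.layers API as the rest of this example).
    @staticmethod
    def _example_build_fc_net(input_tfs, layer_sizes=(1024, 512)):
        h = tf.concat(input_tfs, axis=-1)
        for size in layer_sizes:
            h = tf.layers.dense(inputs=h, units=size, activation=tf.nn.relu)
        return h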

    def _build_normalizers(self):
        super()._build_normalizers()
        with self.sess.as_default(), self.graph.as_default(), tf.variable_scope(self.tf_scope):
            with tf.variable_scope(self.RESOURCE_SCOPE):
                val_offset, val_scale = self._calc_val_offset_scale(self.discount)
                self.val_norm = TFNormalizer(self.sess, 'val_norm', 1)
                self.val_norm.set_mean_std(-val_offset, 1.0 / val_scale)
        return

    def _init_normalizers(self):
        super()._init_normalizers()
        with self.sess.as_default(), self.graph.as_default():
            self.val_norm.update()
        return

    def _load_normalizers(self):
        super()._load_normalizers()
        self.val_norm.load()
        return

    def _build_losses(self, json_data):
        actor_weight_decay = 0 if (self.ACTOR_WEIGHT_DECAY_KEY not in json_data) else json_data[self.ACTOR_WEIGHT_DECAY_KEY]
        critic_weight_decay = 0 if (self.CRITIC_WEIGHT_DECAY_KEY not in json_data) else json_data[self.CRITIC_WEIGHT_DECAY_KEY]

        norm_val_diff = self.val_norm.normalize_tf(self.tar_val_tf) - self.val_norm.normalize_tf(self.critic_tf)
        self.critic_loss_tf = 0.5 * tf.reduce_mean(tf.square(norm_val_diff))

        if (critic_weight_decay != 0):
            self.critic_loss_tf += critic_weight_decay * self._weight_decay_loss('main/critic')
        
        norm_a_mean_tf = self.a_norm.normalize_tf(self.actor_tf)
        norm_a_diff = self.a_norm.normalize_tf(self.a_tf) - norm_a_mean_tf

        self.actor_loss_tf = tf.reduce_sum(tf.square(norm_a_diff), axis=-1)
        self.actor_loss_tf *= self.adv_tf
        self.actor_loss_tf = 0.5 * tf.reduce_mean(self.actor_loss_tf)

        norm_a_bound_min = self.a_norm.normalize(self.a_bound_min)
        norm_a_bound_max = self.a_norm.normalize(self.a_bound_max)
        a_bound_loss = TFUtil.calc_bound_loss(norm_a_mean_tf, norm_a_bound_min, norm_a_bound_max)
        a_bound_loss /= self.exp_params_curr.noise
        self.actor_loss_tf += a_bound_loss

        if (actor_weight_decay != 0):
            self.actor_loss_tf += actor_weight_decay * self._weight_decay_loss('main/actor')
        
        return
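
    # Reference only (assumption): TFUtil.calc_bound_loss is not shown here.
    # A bound loss of this kind typically penalizes the squared violation of
    # the action mean outside [bound_min, bound_max]; a numpy equivalent:
    @staticmethod
    def _example_bound_loss(a_mean, bound_min, bound_max):
        under = np.minimum(a_mean - bound_min, 0.0)  # negative where below the lower bound
        over = np.maximum(a_mean - bound_max, 0.0)   # positive where above the upper bound
        return 0.5 * np.mean(np.sum(np.square(under) + np.square(over), axis=-1))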

    def _build_solvers(self, json_data):
        actor_stepsize = 0.001 if (self.ACTOR_STEPSIZE_KEY not in json_data) else json_data[self.ACTOR_STEPSIZE_KEY]
        actor_momentum = 0.9 if (self.ACTOR_MOMENTUM_KEY not in json_data) else json_data[self.ACTOR_MOMENTUM_KEY]
        critic_stepsize = 0.01 if (self.CRITIC_STEPSIZE_KEY not in json_data) else json_data[self.CRITIC_STEPSIZE_KEY]
        critic_momentum = 0.9 if (self.CRITIC_MOMENTUM_KEY not in json_data) else json_data[self.CRITIC_MOMENTUM_KEY]
        
        critic_vars = self._tf_vars('main/critic')
        critic_opt = tf.train.MomentumOptimizer(learning_rate=critic_stepsize, momentum=critic_momentum)
        self.critic_grad_tf = tf.gradients(self.critic_loss_tf, critic_vars)
        self.critic_solver = MPISolver(self.sess, critic_opt, critic_vars)

        actor_vars = self._tf_vars('main/actor')
        actor_opt = tf.train.MomentumOptimizer(learning_rate=actor_stepsize, momentum=actor_momentum)
        self.actor_grad_tf = tf.gradients(self.actor_loss_tf, actor_vars)
        self.actor_solver = MPISolver(self.sess, actor_opt, actor_vars)

        return
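
    # Note (assumption): MPISolver is not shown in this snippet. It is assumed
    # to aggregate the locally computed gradients across MPI workers before
    # applying the optimizer update, e.g. by averaging them per variable:
    @staticmethod
    def _example_average_grads(per_worker_grads):
        # per_worker_grads: one list of numpy gradient arrays per MPI worker
        return [np.mean(np.stack(g, axis=0), axis=0) for g in zip(*per_worker_grads)]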

    def _build_net_actor(self, net_name, init_output_scale):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]
        
        h = NetBuilder.build_net(net_name, input_tfs)
        norm_a_tf = tf.layers.dense(inputs=h, units=self.get_action_size(), activation=None,
                                    kernel_initializer=tf.random_uniform_initializer(minval=-init_output_scale, maxval=init_output_scale))
        
        a_tf = self.a_norm.unnormalize_tf(norm_a_tf)
        return a_tf
    
    def _build_net_critic(self, net_name):
        norm_s_tf = self.s_norm.normalize_tf(self.s_tf)
        input_tfs = [norm_s_tf]
        if (self.has_goal()):
            norm_g_tf = self.g_norm.normalize_tf(self.g_tf)
            input_tfs += [norm_g_tf]
        
        h = NetBuilder.build_net(net_name, input_tfs)
        norm_val_tf = tf.layers.dense(inputs=h, units=1, activation=None,
                                      kernel_initializer=TFUtil.xavier_initializer)

        norm_val_tf = tf.reshape(norm_val_tf, [-1])
        val_tf = self.val_norm.unnormalize_tf(norm_val_tf)
        return val_tf

    def _initialize_vars(self):
        super()._initialize_vars()
        self._sync_solvers()
        return

    def _sync_solvers(self):
        self.actor_solver.sync()
        self.critic_solver.sync()
        return

    def _decide_action(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            self._exp_action = False
            a = self._eval_actor(s, g)[0]
            logp = 0

            if self._enable_stoch_policy():
                # epsilon-greedy
                rand_action = MathUtil.flip_coin(self.exp_params_curr.rate)
                if rand_action:
                    norm_exp_noise = np.random.randn(*a.shape)
                    norm_exp_noise *= self.exp_params_curr.noise
                    exp_noise = norm_exp_noise * self.a_norm.std
                    a += exp_noise

                    logp = self._calc_action_logp(norm_exp_noise)
                    self._exp_action = True

        return a, logp
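
    # Note (added for clarity): exploration noise is drawn in normalized action
    # space with standard deviation self.exp_params_curr.noise and scaled back
    # to action units via self.a_norm.std, so _calc_action_logp is evaluated on
    # the normalized deltas rather than on the applied noise.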

    def _enable_stoch_policy(self):
        return self.enable_training and (self._mode == self.Mode.TRAIN or self._mode == self.Mode.TRAIN_END)

    def _eval_actor(self, s, g):
        s = np.reshape(s, [-1, self.get_state_size()])
        g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None
          
        feed = {
            self.s_tf : s,
            self.g_tf : g
        }

        a = self.actor_tf.eval(feed)
        return a
    
    def _eval_critic(self, s, g):
        with self.sess.as_default(), self.graph.as_default():
            s = np.reshape(s, [-1, self.get_state_size()])
            g = np.reshape(g, [-1, self.get_goal_size()]) if self.has_goal() else None

            feed = {
                self.s_tf : s,
                self.g_tf : g
            }

            val = self.critic_tf.eval(feed)    
        return val

    def _record_flags(self):
        flags = int(0)
        if (self._exp_action):
            flags = flags | self.EXP_ACTION_FLAG
        return flags

    def _train_step(self):
        super()._train_step()

        critic_loss = self._update_critic()
        actor_loss = self._update_actor()
        critic_loss = MPIUtil.reduce_avg(critic_loss)
        actor_loss = MPIUtil.reduce_avg(actor_loss)

        critic_stepsize = self.critic_solver.get_stepsize()
        actor_stepsize = self.actor_solver.get_stepsize()
        
        self.logger.log_tabular('Critic_Loss', critic_loss)
        self.logger.log_tabular('Critic_Stepsize', critic_stepsize)
        self.logger.log_tabular('Actor_Loss', actor_loss) 
        self.logger.log_tabular('Actor_Stepsize', actor_stepsize)

        return

    def _update_critic(self):
        idx = self.replay_buffer.sample(self._local_mini_batch_size)
        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if self.has_goal() else None
        
        tar_V = self._calc_updated_vals(idx)
        tar_V = np.clip(tar_V, self.val_min, self.val_max)

        feed = {
            self.s_tf: s,
            self.g_tf: g,
            self.tar_val_tf: tar_V
        }

        loss, grads = self.sess.run([self.critic_loss_tf, self.critic_grad_tf], feed)
        self.critic_solver.update(grads)
        return loss
    
    def _update_actor(self):
        key = self.EXP_ACTION_FLAG
        idx = self.replay_buffer.sample_filtered(self._local_mini_batch_size, key)
        has_goal = self.has_goal()

        s = self.replay_buffer.get('states', idx)
        g = self.replay_buffer.get('goals', idx) if has_goal else None
        a = self.replay_buffer.get('actions', idx)

        V_new = self._calc_updated_vals(idx)
        V_old = self._eval_critic(s, g)
        adv = V_new - V_old

        feed = {
            self.s_tf: s,
            self.g_tf: g,
            self.a_tf: a,
            self.adv_tf: adv
        }

        loss, grads = self.sess.run([self.actor_loss_tf, self.actor_grad_tf], feed)
        self.actor_solver.update(grads)

        return loss

    def _calc_updated_vals(self, idx):
        r = self.replay_buffer.get('rewards', idx)

        if self.discount == 0:
            new_V = r
        else:
            next_idx = self.replay_buffer.get_next_idx(idx)
            s_next = self.replay_buffer.get('states', next_idx)
            g_next = self.replay_buffer.get('goals', next_idx) if self.has_goal() else None

            is_end = self.replay_buffer.is_path_end(idx)
            is_fail = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Fail)
            is_succ = self.replay_buffer.check_terminal_flag(idx, Env.Terminate.Succ)
            is_fail = np.logical_and(is_end, is_fail) 
            is_succ = np.logical_and(is_end, is_succ) 

            V_next = self._eval_critic(s_next, g_next)
            V_next[is_fail] = self.val_fail
            V_next[is_succ] = self.val_succ

            new_V = r + self.discount * V_next
        return new_V
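
    # Worked example (added for clarity): with self.discount = 0.95,
    # r = [1.0, 1.0], V_next = [10.0, 12.0] and the second transition flagged
    # as a failure with self.val_fail = 0, the one-step targets become
    # [1.0 + 0.95 * 10.0, 1.0 + 0.95 * 0.0] = [10.5, 1.0].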

    def _calc_action_logp(self, norm_action_deltas):
        # norm_action_deltas are deltas in normalized action space; the noise
        # actually applied to the action is scaled back by self.a_norm.std
        stdev = self.exp_params_curr.noise
        assert stdev > 0

        a_size = self.get_action_size()
        logp = -0.5 / (stdev * stdev) * np.sum(np.square(norm_action_deltas), axis=-1)
        logp += -0.5 * a_size * np.log(2 * np.pi)
        logp += -a_size * np.log(stdev)
        return logp
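
    # Cross-check (illustrative; uses scipy, which the original example does
    # not import): the closed form above should match the log-density of an
    # isotropic Gaussian N(0, stdev^2 I) evaluated at the normalized deltas.
    @staticmethod
    def _example_check_action_logp(norm_action_deltas, stdev):
        from scipy.stats import norm
        return np.sum(norm.logpdf(norm_action_deltas, loc=0.0, scale=stdev), axis=-1)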

    def _log_val(self, s, g):
        val = self._eval_critic(s, g)
        norm_val = self.val_norm.normalize(val)
        self.world.env.log_val(self.id, norm_val[0])
        return

    def _build_replay_buffer(self, buffer_size):
        super()._build_replay_buffer(buffer_size)
        self.replay_buffer.add_filter_key(self.EXP_ACTION_FLAG)
        return