Code Example #1
File: qmix_tf.py  Project: shishouyuan/xingtian
class QMixModel(object):
    """Define QMix model with tensorflow.graph."""

    def __init__(self, model_info):
        """
        Update default model parameters with model_info.

        The full graph contains five sub-graphs, but the explorer only
        needs the explore graph. Following a least-cost principle, the
        explorer initializes only the explore graph, while the train
        process initializes the train graph. (A usage sketch follows
        this code example.)
        """
        logging.debug("init qmix model with:\n{}".format(model_info))
        model_config = model_info.get("model_config", None)

        self.model_config = model_config

        self.graph = tf.Graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config, graph=self.graph)
        self.sess = sess

        # start to fetch parameters
        self.gamma = model_config.get("gamma", 0.99)
        self.lr = model_config.get("lr", 0.0005)
        self.grad_norm_clip = model_config.get("grad_norm_clip", 10)

        self.n_agents = model_config["n_agents"]
        self.obs_shape = model_config["obs_shape"]
        self.rnn_hidden_dim = model_config["rnn_hidden_dim"]

        seq_limit = model_config["episode_limit"]
        self.fix_seq_length = seq_limit  # use the episode limit as fix shape.

        self.n_actions = model_config["n_actions"]

        self.batch_size = model_config["batch_size"]
        self.avail_action_num = model_config["n_actions"]
        self.state_dim = int(np.prod(model_config["state_shape"]))
        self.embed_dim = model_config["mixing_embed_dim"]

        self.use_double_q = model_config.get("use_double_q", True)
        # done fetching parameters from the configuration

        with self.graph.as_default():
            # placeholders used with tf.Session.run
            # buffer for exploration
            # note: keep the 4-D shape consistent with the train operation!
            self.ph_obs = tf.placeholder(
                tf.float32, shape=(1, 1, self.n_agents, self.obs_shape), name="obs")

            self.ph_hidden_states_in = tf.placeholder(
                tf.float32, shape=(None, self.rnn_hidden_dim), name="hidden_in")
            self.agent_outs, self.hidden_outs = None, None
            self._explore_paras = None
            self.gru_cell = None
            self.hi_out_val = None

            # placeholder for train
            self.ph_avail_action = tf.placeholder(
                tf.float32,
                shape=[
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.avail_action_num,
                ],
                name="avail_action",
            )

            self.ph_actions = tf.placeholder(
                tf.float32,
                shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1],
                name="actions",
            )

            self.ph_train_obs = tf.placeholder(
                tf.float32,
                shape=(
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.obs_shape,
                ),
                name="train_obs",
            )
            self.ph_train_obs_len = tf.placeholder(
                tf.float32, shape=(None, ), name="train_obs_len")

            # eval mixer ---------------
            self.ph_train_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_stats",
            )
            # target mixer -------------------
            self.ph_train_target_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_target_stats",
            )

            self.q_tot, self.target_q_tot = None, None

            self.ph_rewards = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="rewards",
            )
            self.ph_terminated = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="terminated",
            )
            self.ph_mask = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="mask",
            )

            self.loss, self.grad_update = None, None

            # graph weights update
            self.agent_train_replace_op = None
            self.agent_explore_replace_op = None
            self.mix_train_replace_op = None

        # init graph
        self.g_type = model_info.get("scene", "explore")

        self.build_actor_graph()  # NOTE: build actor always
        if self.g_type == "train":
            self.build_train_graph()

        # note: initialize these variables only once; this is important!
        with self.graph.as_default():
            self.actor_var = TFVariables([self.agent_outs, self.hidden_outs], self.sess)

            self.sess.run(tf.global_variables_initializer())
            self.hi_out_val_default = self.sess.run(
                self.gru_cell.zero_state(self.n_agents, dtype=tf.float32))

            # max_to_keep defaults to 5; checkpoints could then be removed before evaluation
            self.explore_saver = tf.train.Saver({
                t.name: t for t in self._explore_paras}, max_to_keep=100,)

    def build_actor_graph(self):
        """Build explorer graph with minimum principle."""
        with self.graph.as_default():
            with tf.variable_scope("explore_agent"):
                self.agent_outs, self.hidden_outs = self.build_agent_net(
                    inputs_obs=self.ph_obs,
                    seq_max=1,  # must be 1 for single-step inference
                    obs_lengths=[1 for _ in range(self.n_agents)],
                    hidden_state_in=self.ph_hidden_states_in,
                )

            self._explore_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

    def build_agent_net(self, inputs_obs, seq_max, obs_lengths, hidden_state_in):
        """
        Build agent architecture.

        Shared by the explorer and the trainer, which feed different sequence lengths.
        """
        fc1 = tf.layers.dense(
            inputs=inputs_obs,
            units=self.rnn_hidden_dim,
            activation=tf.nn.relu,
        )

        fc1 = tf.transpose(fc1, perm=[0, 2, 1, 3])
        logging.debug("fc1 before reshape: {}".format(fc1))
        fc1 = tf.reshape(fc1, [-1, seq_max, self.rnn_hidden_dim])
        logging.debug("fc1 after reshape: {}".format(fc1))

        gru_cell = tf.nn.rnn_cell.GRUCell(
            num_units=self.rnn_hidden_dim,  # dtype=self.dtype
        )
        # record the GRU cell only once, to initialize the hidden state later.
        if not self.gru_cell:
            self.gru_cell = gru_cell

        # tf.nn.dynamic_rnn handles variable-length sequences
        rnn_output, hidden_state_out = tf.nn.dynamic_rnn(
            gru_cell,
            fc1,
            dtype=tf.float32,
            initial_state=hidden_state_in,
            sequence_length=obs_lengths,
        )

        logging.debug("rnn raw out: {} ".format(rnn_output))
        rnn_output = tf.reshape(
            rnn_output, [-1, self.n_agents, seq_max, self.rnn_hidden_dim])
        rnn_output = tf.transpose(rnn_output, perm=[0, 2, 1, 3])

        rnn_output = tf.reshape(rnn_output, [-1, self.rnn_hidden_dim])

        fc2_outputs = tf.layers.dense(
            inputs=rnn_output,
            units=self.n_actions,
            activation=None,
        )

        out_actions = tf.reshape(
            fc2_outputs, (-1, self.n_agents, self.avail_action_num))
        logging.debug("out action: {}".format(out_actions))

        return out_actions, hidden_state_out

    def reset_hidden_state(self):
        """Reset hidden state with value assign."""
        self.hi_out_val = self.hi_out_val_default

    def infer_actions(self, agent_inputs):
        """Unify inference api."""
        out_val, self.hi_out_val = self.sess.run(
            [self.agent_outs, self.hidden_outs],
            feed_dict={
                self.ph_obs: agent_inputs,
                self.ph_hidden_states_in: self.hi_out_val,
            },
        )
        return out_val

    def gather_custom(self, inputs, indices):
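        # Select the Q-value of each chosen action: build a one-hot mask from the
        # integer indices (shape (..., 1)) and sum over the action axis, e.g.
        # inputs (B, T, n_agents, n_actions) -> output (B, T, n_agents).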
        indices = tf.cast(indices, tf.uint8)
        one_hot = tf.squeeze(
            tf.one_hot(indices=indices, depth=self.n_actions, on_value=1.,
                       off_value=0., axis=-1, dtype=tf.float32),
            axis=-2)
        mul_test = tf.multiply(inputs, one_hot)
        # reduce_sum_val = tf.reduce_sum(mul_test, axis=-1, keep_dims=True)
        reduce_sum_val = tf.reduce_sum(mul_test, axis=-1)
        return reduce_sum_val

    def _build_mix_net2(self, agent_qs, states):
        hypernet_embed = self.model_config["hypernet_embed"]

        def hyper_w1(hyper_w1_input):
            """
            Create hyper_w1.

            input shape (none, state_dim)
            """
            with tf.variable_scope("hyper_w1"):
                hw0 = tf.layers.dense(inputs=hyper_w1_input,
                                      units=hypernet_embed,
                                      activation=tf.nn.relu)
                hw1 = tf.layers.dense(inputs=hw0,
                                      units=self.embed_dim * self.n_agents,
                                      activation=None)
                return hw1

        def hyper_w_final(hyper_w_final_input):
            """
            Create hyper_w_final.

            input shape (none, state_dim)
            """
            with tf.variable_scope("hyper_w_final"):
                hw_f0 = tf.layers.dense(
                    inputs=hyper_w_final_input,
                    units=hypernet_embed,
                    activation=tf.nn.relu,
                )
                hw_f1 = tf.layers.dense(inputs=hw_f0,
                                        units=self.embed_dim,
                                        activation=None)
                return hw_f1

        def hyper_b1(state_input):
            """State dependent bias for hidden layer."""
            with tf.variable_scope("hyper_b1"):
                return tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=None)

        def val(state_input):
            """V(s) instead of a bias for the last layers."""
            with tf.variable_scope("val_for_bias"):
                val0 = tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=tf.nn.relu)
                val2 = tf.layers.dense(inputs=val0, units=1, activation=None)
                return val2

        bs = agent_qs.get_shape().as_list()[0]
        states_reshaped = tf.reshape(states, (-1, self.state_dim))
        agent_qs_reshaped = tf.reshape(agent_qs, (-1, 1, self.n_agents))

        # first layer
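        # tf.math.abs keeps the hyper-network weights non-negative, which
        # enforces QMIX's monotonic mixing constraint.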
        w1 = tf.math.abs(hyper_w1(states_reshaped))
        b1 = hyper_b1(states_reshaped)

        w1_reshaped = tf.reshape(w1, (-1, self.n_agents, self.embed_dim))
        b1_reshaped = tf.reshape(b1, (-1, 1, self.embed_dim))

        to_hidden_val = tf.math.add(
            tf.matmul(agent_qs_reshaped, w1_reshaped), b1_reshaped)
        hidden = tf.nn.elu(to_hidden_val)

        # second layer
        w_final = tf.math.abs(hyper_w_final(states_reshaped))
        w_final_reshaped = tf.reshape(w_final, (-1, self.embed_dim, 1))

        # state-dependent bias
        v = tf.reshape(val(states_reshaped), (-1, 1, 1))

        # compute final output
        y = tf.math.add(tf.matmul(hidden, w_final_reshaped), v)

        # reshape and return
        q_tot = tf.reshape(y, (bs, -1, 1))

        return q_tot

    @staticmethod
    def _print_trainable_var_name(**kwargs):
        """Print trainable variable name."""
        for k, v in kwargs.items():
            logging.info("{}: \n {}".format(k, list([t.name for t in v])))

    def build_train_graph(self):
        """
        Build train graph.

        Because the seq_max differs (1 vs. the episode limit),
        the train graph cannot be connected directly to the actor graph.
        Hence we build separate explore and train sub-graphs and sync
        them with tf.assign between the two variable collections.
        :return:
        """
        with self.graph.as_default():
            with tf.variable_scope("eval_agent"):
                trajectory_agent_outs, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    seq_max=self.fix_seq_length + 1,  # full-trajectory length matters here
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,  # full trajectory, no need to carry the hidden state
                )

            with tf.variable_scope("target_agent"):
                tar_agent_outs_tmp, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    # fixed value; differs between explore and train
                    seq_max=self.fix_seq_length + 1,
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,
                )
                target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

            _eval_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
            _target_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

            with tf.variable_scope("soft_replacement"):
                self.agent_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_agent_paras,
                                                    _eval_agent_paras)]

                self.agent_explore_replace_op = [
                    tf.assign(t, e) for t, e in zip(self._explore_paras,
                                                    _eval_agent_paras)
                ]

            self._print_trainable_var_name(
                _eval_agent_paras=_eval_agent_paras,
                _target_agent_paras=_target_agent_paras,
                _explore_paras=self._explore_paras,
            )

            # agent out to max q values
            # Calculate estimated Q-Values ----------------
            mac_out = tf.reshape(
                trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            logging.debug("mac_out: {}".format(mac_out))
            chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                     self.ph_actions)

            # Calculate the Q-Values necessary for the target -----------
            target_mac_out = tf.reshape(
                target_trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            target_mac_out = target_mac_out[:, 1:]

            # Mask out unavailable actions
            # target_mac_out[avail_actions[:, 1:] == 0] = -9999999
            indices = tf.equal(self.ph_avail_action[:, 1:], 0)
            mask_val = tf.tile(
                [[[[-999999.0]]]],
                [
                    self.batch_size,
                    self.fix_seq_length,
                    self.n_agents,
                    self.avail_action_num,
                ],
            )
            logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
                indices, mask_val, target_mac_out))

            target_mac_out = tf.where(indices, mask_val, target_mac_out)

            if self.use_double_q:
                # Get actions that maximise live Q (for double q-learning)
                mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
                mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
                cur_max_actions = tf.expand_dims(
                    tf.argmax(mac_out_detach, axis=-1), -1)
                target_max_qvals = self.gather_custom(target_mac_out,
                                                      cur_max_actions)
            else:
                target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

            # eval mixer ---------------
            with tf.variable_scope("eval_mixer"):
                self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                                  self.ph_train_states)

            with tf.variable_scope("target_mixer"):
                q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                                 self.ph_train_target_states)
                self.target_q_tot = tf.stop_gradient(q_tot_tmp)

            _eval_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
            _target_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

            with tf.variable_scope("soft_replacement"):
                self.mix_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_mix_paras,
                                                    _eval_mix_paras)]

            self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                           _target_mix_paras=_target_mix_paras)

            # Calculate 1-step Q-Learning targets
            targets = (self.ph_rewards +
                       self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)
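            # i.e. y_t = r_t + gamma * (1 - terminated_t) * Q_tot_target(s_{t+1})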

            # Td-error
            td_error = self.q_tot - tf.stop_gradient(targets)

            # mask = mask.expand_as(td_error)  # FIXME: assumed to already have the same shape

            # 0-out the targets that came from padded data
            masked_td_error = tf.multiply(td_error, self.ph_mask)

            self.loss = tf.reduce_sum(masked_td_error**2) / tf.reduce_sum(self.ph_mask)

            # Optimise
            optimizer = tf.train.RMSPropOptimizer(
                self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            capped_gvs = [(
                grad if grad is None else tf.clip_by_norm(
                    grad, clip_norm=self.grad_norm_clip),
                var,
            ) for grad, var in grads_and_vars]
            self.grad_update = optimizer.apply_gradients(capped_gvs)

    def assign_targets(self):
        """
        Update target weights periodically.

        1. from the eval agent to the target agent
        2. from the eval mixer to the target mixer
        :return:
        """
        _a, _m = self.sess.run([self.agent_train_replace_op,
                                self.mix_train_replace_op])

    def assign_explore_agent(self):
        """
        Update the explore agent after each training step.

        :return:
        """
        _ = self.sess.run(self.agent_explore_replace_op)

    def save_explore_agent_weights(self, save_path):
        """Save explore agent weight for explorer."""
        # explore_saver = tf.train.Saver({t.name: t for t in self._explore_paras})
        self.explore_saver.save(
            self.sess, save_path=save_path, write_meta_graph=False)
        # tf.train.list_variables(tf.train.latest_checkpoint(wp))

    def set_weights(self, weights):
        """Set weight with memory tensor."""
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Get the weights."""
        with self.graph.as_default():
            return self.actor_var.get_weights()

    def restore_explorer_variable(self, model_name):
        """Restore explorer variable with tf.train.checkpoint."""
        reader = tf.train.NewCheckpointReader(model_name)
        var_names = reader.get_variable_to_shape_map().keys()
        result = {}
        for n in var_names:
            result[n] = reader.get_tensor(n)
            logging.debug("read variable-{} from file:{}".format(n, model_name))
        with self.sess.as_default():  # must run under this session
            for var_key in self._explore_paras:
                try:
                    var_key.load(result[var_key.name])
                    logging.debug("load {} success".format(var_key.name))
                except BaseException as err:
                    raise KeyError("update {} error:{}".format(var_key.name, err))

    def train(
            self,
            batch_trajectories,
            train_obs_len,
            avail_actions,
            actions,
            cur_stats,
            target_stats,
            rewards,
            terminated,
            mask):
        """
        Train with the whole graph.

        Update the explorer graph after each training step, and the target as required.

        :param batch_trajectories:
        :param train_obs_len: list([max_ep for _ in range(batch.batch_size * n_agents)])
        :param avail_actions: avail action from environment
        :param actions: actual actions within trajectory
        :param cur_stats: batch["state"][:, :-1]
        :param target_stats: batch["state"][:, 1:]
        :param rewards:
        :param terminated:
        :param mask:
        :return:
        """
        _, loss_val = self.sess.run(
            [self.grad_update, self.loss],
            feed_dict={
                self.ph_train_obs: batch_trajectories,
                # Note: trajectories are split per agent.
                self.ph_train_obs_len: train_obs_len,
                self.ph_avail_action: avail_actions,
                self.ph_actions: actions,
                self.ph_train_states: cur_stats,
                self.ph_train_target_states: target_stats,
                self.ph_rewards: rewards,
                self.ph_terminated: terminated,
                self.ph_mask: mask,
            },
        )
        logging.debug("train_loss: {}".format(loss_val))
        return loss_val
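The constructor reads every hyper-parameter from model_info["model_config"], so an instance only needs that dictionary plus the "scene" key. The snippet below is a hypothetical usage sketch, not part of the project: the dictionary values are illustrative, and it assumes the module-level dependencies of qmix_tf.py (TensorFlow 1.x, numpy, logging, TFVariables) are importable.

import numpy as np

# illustrative configuration values; real ones come from the experiment config
model_config = {
    "n_agents": 3,
    "obs_shape": 42,
    "rnn_hidden_dim": 64,
    "episode_limit": 60,
    "n_actions": 9,
    "batch_size": 32,
    "state_shape": 48,
    "mixing_embed_dim": 32,
    "hypernet_embed": 64,
}

# explorer: builds only the actor (explore) graph
explore_model = QMixModel({"model_config": model_config, "scene": "explore"})
explore_model.reset_hidden_state()

obs = np.zeros((1, 1, 3, 42), dtype=np.float32)   # (1, 1, n_agents, obs_shape)
q_values = explore_model.infer_actions(obs)       # (1, n_agents, n_actions)

# trainer: additionally builds the train graph and can call train() / assign_targets()
train_model = QMixModel({"model_config": model_config, "scene": "train"})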
Code Example #2
class ImpalaCnnOpt(XTModel):
    """Docstring for ActorNetwork."""
    def __init__(self, model_info):
        model_config = model_info.get("model_config", dict())
        import_config(globals(), model_config)
        self.dtype = DTYPE_MAP.get(model_info.get("default_dtype", "float32"))
        self.input_dtype = model_info.get("input_dtype", "float32")
        self.sta_mean = model_info.get("state_mean", 0.)
        self.sta_std = model_info.get("state_std", 255.)

        self._transform = partial(state_transform,
                                  mean=self.sta_mean,
                                  std=self.sta_std,
                                  input_dtype=self.input_dtype)

        self.state_dim = model_info["state_dim"]
        self.action_dim = model_info["action_dim"]
        self.filter_arch = get_atari_filter(self.state_dim)

        # lr schedule with linear_cosine_decay
        self.lr_schedule = model_config.get("lr_schedule", None)
        self.opt_type = model_config.get("opt_type", "adam")
        self.lr = None

        self.ph_state = None
        self.ph_adv = None
        self.out_actions = None
        self.pi_logic_outs, self.baseline = None, None

        # placeholder for behavior policy logic outputs
        self.ph_bp_logic_outs = None
        self.ph_actions = None
        self.ph_dones = None
        self.ph_rewards = None
        self.loss, self.optimizer, self.train_op = None, None, None

        self.grad_norm_clip = model_config.get("grad_norm_clip", 40.0)
        self.sample_batch_steps = model_config.get("sample_batch_step", 50)

        self.saver = None
        self.explore_paras = None
        self.actor_var = None  # store weights for agent

        super().__init__(model_info)

    def create_model(self, model_info):
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")

        self.ph_actions = tf.placeholder(tf.int32,
                                         shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool,
                                       shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype,
                                         shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
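        # e.g. two concatenated 50-step trajectories (a flat length of 100)
        # become a tensor of shape [50, 2, ...]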
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

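        # V-trace off-policy loss: behaviour/target logits, actions, discounts,
        # clipped rewards and values are reshaped to [T, B] (dropping the last
        # step); the final baseline step provides the bootstrap value.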
        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) *
                                    GAMMA,
                                    drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(LR,
                                                  decay=0.99,
                                                  epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))

        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # FIXME: expose the optimizer's learning rate so it can be monitored during training
        self.lr = optimizer._lr

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        self.saver = Saver({t.name: t
                            for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True

    def _get_lr(self, global_step, decay_step=20000.):
        """Make decay learning rate."""
        lr_schedule = self.lr_schedule
        if len(lr_schedule) != 2:
            logging.warning("Need 2 elements in lr_schedule!\n, "
                            "likes [[0, 0.01], [20000, 0.000001]]")
            logging.fatal("lr_schedule invalid: {}".format(lr_schedule))

        if lr_schedule[0][0] != 0:
            logging.info("lr_schedule[0][1] could been init learning rate")

        learning_rate = linear_cosine_decay(lr_schedule[0][1],
                                            global_step,
                                            decay_step,
                                            beta=lr_schedule[1][1] /
                                            float(decay_step))

        return learning_rate

    def train(self, state, label):
        """Train with sess.run."""
        bp_logic_outs, actions, dones, rewards = label
        with self.graph.as_default():
            _, loss = self.sess.run(
                [self.train_op, self.loss],
                feed_dict={
                    self.ph_state: state,
                    self.ph_bp_logic_outs: bp_logic_outs,
                    self.ph_actions: actions,
                    self.ph_dones: dones,
                    self.ph_rewards: rewards,
                },
            )
        return loss

    def predict(self, state):
        """
        Run prediction with the newest model.

        :param: state
        :return: action_logits, action_val, value
        """
        with self.graph.as_default():
            feed_dict = {self.ph_state: state}
            return self.sess.run(
                [self.pi_logic_outs, self.baseline, self.out_actions],
                feed_dict)

    def save_model(self, file_name):
        """Save model without meta graph."""
        ck_name = self.saver.save(self.sess,
                                  save_path=file_name,
                                  write_meta_graph=False)
        return ck_name

    def load_model(self, model_name, by_name=False):
        """Load model with inference variables."""
        restore_tf_variable(self.sess, self.explore_paras, model_name)

    def set_weights(self, weights):
        """Set weight with memory tensor."""
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Get weights."""
        with self.graph.as_default():
            return self.actor_var.get_weights()
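As with the example above, everything is driven by model_info. The snippet below is a hypothetical usage sketch: the Atari-style 84x84x4 state, the four actions and the dictionary values are illustrative, and it assumes the XTModel base class and the xingtian helpers referenced in this file (vtrace_loss, TFVariables, state_transform, LR, GAMMA, etc.) are importable.

import numpy as np

model_info = {
    "model_config": {"opt_type": "adam", "grad_norm_clip": 40.0, "sample_batch_step": 50},
    "state_dim": [84, 84, 4],   # illustrative stacked-frame shape
    "action_dim": 4,
}
model = ImpalaCnnOpt(model_info)

# inference: policy logits, value estimates and sampled actions
states = np.zeros((8, 84, 84, 4), dtype=np.float32)
logits, values, actions = model.predict(states)

# training: flat tensors whose length is a multiple of sample_batch_step (50)
batch = 2 * 50
label = (
    np.zeros((batch, 4), dtype=np.float32),  # behaviour-policy logits
    np.zeros((batch,), dtype=np.int32),      # actions
    np.zeros((batch,), dtype=np.bool_),      # dones
    np.zeros((batch,), dtype=np.float32),    # rewards
)
loss = model.train(np.zeros((batch, 84, 84, 4), dtype=np.float32), label)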