Example #1
    def create_model(self, model_info):
        """Create keras model."""
        state_input = Input(shape=self.state_dim, name='state_input')
        advantage = Input(shape=(1, ), name='adv')

        denselayer = Dense(HIDDEN_SIZE, activation='relu')(state_input)
        for _ in range(NUM_LAYERS - 1):
            denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer)

        out_actions = Dense(self.action_dim,
                            activation='softmax',
                            name='output_actions')(denselayer)  # y_pred
        out_value = Dense(1, name='output_value')(denselayer)
        model = Model(inputs=[state_input, advantage],
                      outputs=[out_actions, out_value])
        losses = {
            "output_actions": impala_loss(advantage),
            "output_value": 'mse'
        }
        lossweights = {"output_actions": 1.0, "output_value": .5}

        model.compile(optimizer=Adam(lr=LR),
                      loss=losses,
                      loss_weights=lossweights)

        self.infer_state = tf.placeholder(tf.float32,
                                          name="infer_state",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1))
        self.infer_p, self.infer_v = model([self.infer_state, self.adv])
        self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess)
        self.sess.run(tf.initialize_all_variables())

        return model
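The "output_actions" head in Example #1 is compiled with impala_loss(advantage), i.e. a Keras loss closed over the advantage input tensor. The real impala_loss is defined elsewhere in the source; the snippet below is only a minimal sketch of that closure pattern (an advantage-weighted policy-gradient loss), and the name impala_loss_sketch plus the Keras backend import are assumptions, not the original implementation.

from tensorflow.keras import backend as K

def impala_loss_sketch(advantage):
    """Illustrative only: advantage-weighted policy-gradient loss closed over an Input tensor."""
    def loss(y_true, y_pred):
        # y_true: one-hot behaviour actions, y_pred: softmax policy output
        log_p = K.log(K.sum(y_true * y_pred, axis=-1, keepdims=True) + 1e-10)
        return -log_p * advantage
    return loss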
Example #2
    def create_model(self, model_info):
        """Create Deep-Q network."""
        state = Input(shape=self.state_dim)
        denselayer = Dense(HIDDEN_SIZE, activation='relu')(state)
        for _ in range(NUM_LAYERS - 1):
            denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer)

        value = Dense(self.action_dim, activation='linear')(denselayer)
        if self.dueling:
            adv = Dense(1, activation='linear')(denselayer)
            mean = Lambda(layer_normalize)(value)
            value = Lambda(layer_add)([adv, mean])

        model = Model(inputs=state, outputs=value)
        adam = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=adam)

        self.infer_state = tf.placeholder(tf.float32,
                                          name="infer_input",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.infer_v = model(self.infer_state)
        self.actor_var = TFVariables([self.infer_v], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model
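In the dueling branch above, layer_normalize and layer_add are helpers defined elsewhere in the source. As a rough orientation only, the standard dueling aggregation Q(s, a) = V(s) + A(s, a) - mean_a A(s, a) could be written as below; the _sketch names and the Keras backend import are assumptions and may differ from the actual helpers.

from tensorflow.keras import backend as K

def layer_normalize_sketch(x):
    # centre the advantage stream by subtracting its per-sample mean
    return x - K.mean(x, axis=1, keepdims=True)

def layer_add_sketch(inputs):
    # add the scalar state value to the centred advantages (broadcast over actions)
    state_value, centred_adv = inputs
    return state_value + centred_adv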
Example #3
    def create_model(self, model_info):
        """Create Deep-Q CNN network."""
        state = Input(shape=self.state_dim, dtype="uint8")
        state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state)
        convlayer = Conv2D(32, (8, 8),
                           strides=(4, 4),
                           activation='relu',
                           padding='valid')(state1)
        convlayer = Conv2D(64, (4, 4),
                           strides=(2, 2),
                           activation='relu',
                           padding='valid')(convlayer)
        convlayer = Conv2D(64, (3, 3),
                           strides=(1, 1),
                           activation='relu',
                           padding='valid')(convlayer)
        flattenlayer = Flatten()(convlayer)
        denselayer = Dense(256, activation='relu')(flattenlayer)
        value = Dense(self.action_dim, activation='linear')(denselayer)
        model = Model(inputs=state, outputs=value)
        adam = Adam(lr=self.learning_rate, clipnorm=10.)
        model.compile(loss='mse', optimizer=adam)
        if model_info.get("summary"):
            model.summary()

        self.infer_state = tf.placeholder(tf.uint8,
                                          name="infer_input",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.infer_v = model(self.infer_state)
        self.actor_var = TFVariables([self.infer_v], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model
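A minimal usage sketch for the inference path built above, assuming dqn is an instance on which create_model has already run; the single zero-filled frame stack is an illustrative placeholder.

import numpy as np

frames = np.zeros((1,) + tuple(dqn.state_dim), dtype=np.uint8)   # e.g. stacked Atari frames
q_values = dqn.sess.run(dqn.infer_v, feed_dict={dqn.infer_state: frames})
action = int(np.argmax(q_values, axis=-1)[0])                    # greedy action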
Example #4
    def build_graph(self, input_type, model):
        # pylint: disable=W0201
        self.state_ph = tf.placeholder(input_type,
                                       name='state',
                                       shape=(None, *self.state_dim))
        self.old_logp_ph = tf.placeholder(tf.float32,
                                          name='old_log_p',
                                          shape=(None, 1))
        self.adv_ph = tf.placeholder(tf.float32,
                                     name='advantage',
                                     shape=(None, 1))
        self.old_v_ph = tf.placeholder(tf.float32,
                                       name='old_v',
                                       shape=(None, 1))
        self.target_v_ph = tf.placeholder(tf.float32,
                                          name='target_value',
                                          shape=(None, 1))

        pi_latent, self.out_v = model(self.state_ph)

        if self.action_type == 'Categorical':
            self.behavior_action_ph = tf.placeholder(tf.int32,
                                                     name='behavior_action',
                                                     shape=(None, ))
            dist_param = pi_latent
        elif self.action_type == 'DiagGaussian':
            # fixme: add input-dependent log_std logic
            self.behavior_action_ph = tf.placeholder(tf.float32,
                                                     name='real_action',
                                                     shape=(None,
                                                            self.action_dim))
            log_std = tf.get_variable('pi_logstd',
                                      shape=(1, self.action_dim),
                                      initializer=tf.zeros_initializer())
            dist_param = tf.concat([pi_latent, pi_latent * 0.0 + log_std],
                                   axis=-1)
        else:
            raise NotImplementedError(
                'action type: {} does not match any implemented distribution.'.
                format(self.action_type))

        self.dist.init_by_param(dist_param)
        self.action = self.dist.sample()
        self.action_log_prob = self.dist.log_prob(self.action)
        self.actor_var = TFVariables([self.action_log_prob, self.out_v],
                                     self.sess)

        self.actor_loss = actor_loss_with_entropy(self.dist, self.adv_ph,
                                                  self.old_logp_ph,
                                                  self.behavior_action_ph,
                                                  self.clip_ratio,
                                                  self.ent_coef)
        self.critic_loss = critic_loss(self.target_v_ph, self.out_v,
                                       self.old_v_ph, self.vf_clip)
        self.loss = self.actor_loss + self.critic_loss_coef * self.critic_loss
        self.train_op = self.build_train_op(self.loss)

        self.sess.run(tf.initialize_all_variables())
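actor_loss_with_entropy and critic_loss above come from elsewhere in the source. For orientation only, a PPO-style clipped surrogate with an entropy bonus typically looks like the sketch below; it assumes dist also exposes entropy(), and the reductions and sign conventions of the real implementation may differ.

def clipped_actor_loss_sketch(dist, adv_ph, old_logp_ph, behavior_action_ph,
                              clip_ratio, ent_coef):
    """Illustrative clipped-surrogate actor loss, not the source implementation."""
    logp = dist.log_prob(behavior_action_ph)
    ratio = tf.exp(logp - old_logp_ph)                           # pi_new / pi_old
    clipped_ratio = tf.clip_by_value(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    surrogate = tf.minimum(ratio * adv_ph, clipped_ratio * adv_ph)
    return -tf.reduce_mean(surrogate) - ent_coef * tf.reduce_mean(dist.entropy())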
Example #5
    def create_model(self, model_info):
        state_input = Input(shape=self.state_dim,
                            name='state_input',
                            dtype='uint8')
        state_input_1 = Lambda(layer_function)(state_input)
        advantage = Input(shape=(1, ), name='adv')

        convlayer = Conv2D(32, (8, 8),
                           strides=(4, 4),
                           activation='relu',
                           padding='valid')(state_input_1)
        convlayer = Conv2D(64, (4, 4),
                           strides=(2, 2),
                           activation='relu',
                           padding='valid')(convlayer)
        convlayer = Conv2D(64, (3, 3),
                           strides=(1, 1),
                           activation='relu',
                           padding='valid')(convlayer)
        flattenlayer = Flatten()(convlayer)
        denselayer = Dense(256, activation='relu')(flattenlayer)

        out_actions = Dense(self.action_dim,
                            activation='softmax',
                            name='output_actions')(denselayer)
        out_value = Dense(1, name='output_value')(denselayer)
        model = Model(inputs=[state_input, advantage],
                      outputs=[out_actions, out_value])
        losses = {
            "output_actions": impala_loss(advantage),
            "output_value": 'mse'
        }
        lossweights = {"output_actions": 1.0, "output_value": .5}

        decay_value = 0.00000000512
        model.compile(optimizer=Adam(lr=LR, clipnorm=40., decay=decay_value),
                      loss=losses,
                      loss_weights=lossweights)

        self.infer_state = tf.placeholder(tf.uint8,
                                          name="infer_state",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1))
        self.infer_p, self.infer_v = model([self.infer_state, self.adv])

        self.actor_var = TFVariables([self.infer_p, self.infer_v], self.sess)

        self.sess.run(tf.initialize_all_variables())

        return model
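Because the advantage input only feeds the compiled training loss, inference through the placeholders above can supply a dummy advantage. A minimal sketch, where net and the batch size of 8 are illustrative placeholders:

import numpy as np

states = np.zeros((8,) + tuple(net.state_dim), dtype=np.uint8)
dummy_adv = np.zeros((8, 1), dtype=np.float32)    # unused by the forward pass
probs, values = net.sess.run(
    [net.infer_p, net.infer_v],
    feed_dict={net.infer_state: states, net.adv: dummy_adv})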
Example #6
    def __init__(self, model_info):
        """
        Update default model.parameters with model info.

        owing to the big graph contains five sub-graph, while,
        explorer could work well with the explore.graph,
        Based on the least-cost principle,
        explorer could init the explore.graph;
        and, train process init the train.graph.
        """
        logging.debug("init qmix model with:\n{}".format(model_info))
        model_config = model_info.get("model_config", None)

        self.model_config = model_config

        self.graph = tf.Graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config, graph=self.graph)
        self.sess = sess

        # start to fetch parameters
        self.gamma = model_config.get("gamma", 0.99)
        self.lr = model_config.get("lr", 0.0005)
        self.grad_norm_clip = model_config.get("grad_norm_clip", 10)

        self.n_agents = model_config["n_agents"]
        self.obs_shape = model_config["obs_shape"]
        self.rnn_hidden_dim = model_config["rnn_hidden_dim"]

        seq_limit = model_config["episode_limit"]
        self.fix_seq_length = seq_limit  # use the episode limit as fix shape.

        self.n_actions = model_config["n_actions"]

        self.batch_size = model_config["batch_size"]
        self.avail_action_num = model_config["n_actions"]
        self.state_dim = int(np.prod(model_config["state_shape"]))
        self.embed_dim = model_config["mixing_embed_dim"]

        self.use_double_q = model_config.get("use_double_q", True)
        # done fetching parameters from the configuration

        with self.graph.as_default():
            # placeholders used with tf.Session.run
            # buffers for exploration
            # note: keep the 4-D shape consistent with the train operation
            self.ph_obs = tf.placeholder(
                tf.float32, shape=(1, 1, self.n_agents, self.obs_shape), name="obs")

            self.ph_hidden_states_in = tf.placeholder(
                tf.float32, shape=(None, self.rnn_hidden_dim), name="hidden_in")
            self.agent_outs, self.hidden_outs = None, None
            self._explore_paras = None
            self.gru_cell = None
            self.hi_out_val = None

            # placeholder for train
            self.ph_avail_action = tf.placeholder(
                tf.float32,
                shape=[
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.avail_action_num,
                ],
                name="avail_action",
            )

            self.ph_actions = tf.placeholder(
                tf.float32,
                shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1],
                name="actions",
            )

            self.ph_train_obs = tf.placeholder(
                tf.float32,
                shape=(
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.obs_shape,
                ),
                name="train_obs",
            )
            self.ph_train_obs_len = tf.placeholder(
                tf.float32, shape=(None, ), name="train_obs_len")

            # eval mixer ---------------
            self.ph_train_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_stats",
            )
            # target mixer -------------------
            self.ph_train_target_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_target_stats",
            )

            self.q_tot, self.target_q_tot = None, None

            self.ph_rewards = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="rewards",
            )
            self.ph_terminated = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="terminated",
            )
            self.ph_mask = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="mask",
            )

            self.loss, self.grad_update = None, None

            # graph weights update
            self.agent_train_replace_op = None
            self.agent_explore_replace_op = None
            self.mix_train_replace_op = None

        # init graph
        self.g_type = model_info.get("scene", "explore")

        self.build_actor_graph()  # NOTE: build actor always
        if self.g_type == "train":
            self.build_train_graph()

        # note: it is important to initialize variables only once
        with self.graph.as_default():
            self.actor_var = TFVariables([self.agent_outs, self.hidden_outs], self.sess)

            self.sess.run(tf.global_variables_initializer())
            self.hi_out_val_default = self.sess.run(
                self.gru_cell.zero_state(self.n_agents, dtype=tf.float32))

            # Saver defaults to max_to_keep=5, which could drop checkpoints still needed for evaluation
            self.explore_saver = tf.train.Saver({
                t.name: t for t in self._explore_paras}, max_to_keep=100,)
Example #7
class QMixModel(object):
    """Define QMix model with tensorflow.graph."""

    def __init__(self, model_info):
        """
        Update default model.parameters with model info.

        owing to the big graph contains five sub-graph, while,
        explorer could work well with the explore.graph,
        Based on the least-cost principle,
        explorer could init the explore.graph;
        and, train process init the train.graph.
        """
        logging.debug("init qmix model with:\n{}".format(model_info))
        model_config = model_info.get("model_config", None)

        self.model_config = model_config

        self.graph = tf.Graph()
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config, graph=self.graph)
        self.sess = sess

        # start to fetch parameters
        self.gamma = model_config.get("gamma", 0.99)
        self.lr = model_config.get("lr", 0.0005)
        self.grad_norm_clip = model_config.get("grad_norm_clip", 10)

        self.n_agents = model_config["n_agents"]
        self.obs_shape = model_config["obs_shape"]
        self.rnn_hidden_dim = model_config["rnn_hidden_dim"]

        seq_limit = model_config["episode_limit"]
        self.fix_seq_length = seq_limit  # use the episode limit as fix shape.

        self.n_actions = model_config["n_actions"]

        self.batch_size = model_config["batch_size"]
        self.avail_action_num = model_config["n_actions"]
        self.state_dim = int(np.prod(model_config["state_shape"]))
        self.embed_dim = model_config["mixing_embed_dim"]

        self.use_double_q = model_config.get("use_double_q", True)
        # done fetching parameters from the configuration

        with self.graph.as_default():
            # placeholders used with tf.Session.run
            # buffers for exploration
            # note: keep the 4-D shape consistent with the train operation
            self.ph_obs = tf.placeholder(
                tf.float32, shape=(1, 1, self.n_agents, self.obs_shape), name="obs")

            self.ph_hidden_states_in = tf.placeholder(
                tf.float32, shape=(None, self.rnn_hidden_dim), name="hidden_in")
            self.agent_outs, self.hidden_outs = None, None
            self._explore_paras = None
            self.gru_cell = None
            self.hi_out_val = None

            # placeholder for train
            self.ph_avail_action = tf.placeholder(
                tf.float32,
                shape=[
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.avail_action_num,
                ],
                name="avail_action",
            )

            self.ph_actions = tf.placeholder(
                tf.float32,
                shape=[self.batch_size, self.fix_seq_length, self.n_agents, 1],
                name="actions",
            )

            self.ph_train_obs = tf.placeholder(
                tf.float32,
                shape=(
                    self.batch_size,
                    self.fix_seq_length + 1,
                    self.n_agents,
                    self.obs_shape,
                ),
                name="train_obs",
            )
            self.ph_train_obs_len = tf.placeholder(
                tf.float32, shape=(None, ), name="train_obs_len")

            # eval mixer ---------------
            self.ph_train_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_stats",
            )
            # target mixer -------------------
            self.ph_train_target_states = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, self.state_dim),
                name="train_target_stats",
            )

            self.q_tot, self.target_q_tot = None, None

            self.ph_rewards = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="rewards",
            )
            self.ph_terminated = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="terminated",
            )
            self.ph_mask = tf.placeholder(
                tf.float32,
                shape=(self.batch_size, self.fix_seq_length, 1),
                name="mask",
            )

            self.loss, self.grad_update = None, None

            # graph weights update
            self.agent_train_replace_op = None
            self.agent_explore_replace_op = None
            self.mix_train_replace_op = None

        # init graph
        self.g_type = model_info.get("scene", "explore")

        self.build_actor_graph()  # NOTE: build actor always
        if self.g_type == "train":
            self.build_train_graph()

        # note: it is important to initialize variables only once
        with self.graph.as_default():
            self.actor_var = TFVariables([self.agent_outs, self.hidden_outs], self.sess)

            self.sess.run(tf.global_variables_initializer())
            self.hi_out_val_default = self.sess.run(
                self.gru_cell.zero_state(self.n_agents, dtype=tf.float32))

            # Saver defaults to max_to_keep=5, which could drop checkpoints still needed for evaluation
            self.explore_saver = tf.train.Saver({
                t.name: t for t in self._explore_paras}, max_to_keep=100,)

    def build_actor_graph(self):
        """Build explorer graph with minimum principle."""
        with self.graph.as_default():
            with tf.variable_scope("explore_agent"):
                self.agent_outs, self.hidden_outs = self.build_agent_net(
                    inputs_obs=self.ph_obs,
                    seq_max=1,  # must be 1 for inference
                    obs_lengths=[1 for _ in range(self.n_agents)],
                    hidden_state_in=self.ph_hidden_states_in,
                )

            self._explore_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

    def build_agent_net(self, inputs_obs, seq_max, obs_lengths, hidden_state_in):
        """
        Build agent architecture.

        Shared by the explorer and the trainer, which use different sequence lengths.
        """
        fc1 = tf.layers.dense(
            inputs=inputs_obs,
            units=self.rnn_hidden_dim,
            activation=tf.nn.relu,
        )

        fc1 = tf.transpose(fc1, perm=[0, 2, 1, 3])
        logging.debug("fc1 before reshape: {}".format(fc1))
        fc1 = tf.reshape(fc1, [-1, seq_max, self.rnn_hidden_dim])
        logging.debug("fc1 after reshape: {}".format(fc1))

        gru_cell = tf.nn.rnn_cell.GRUCell(
            num_units=self.rnn_hidden_dim,  # dtype=self.dtype
        )
        # record the GRU cell only once, so its zero state can initialize the hidden value
        if not self.gru_cell:
            self.gru_cell = gru_cell

        # tf.nn.dynamic_rnn handles variable-length sequences
        rnn_output, hidden_state_out = tf.nn.dynamic_rnn(
            gru_cell,
            fc1,
            dtype=tf.float32,
            initial_state=hidden_state_in,
            sequence_length=obs_lengths,
        )

        logging.debug("rnn raw out: {} ".format(rnn_output))
        rnn_output = tf.reshape(
            rnn_output, [-1, self.n_agents, seq_max, self.rnn_hidden_dim])
        rnn_output = tf.transpose(rnn_output, perm=[0, 2, 1, 3])

        rnn_output = tf.reshape(rnn_output, [-1, self.rnn_hidden_dim])

        fc2_outputs = tf.layers.dense(
            inputs=rnn_output,
            units=self.n_actions,
            activation=None,
        )

        out_actions = tf.reshape(
            fc2_outputs, (-1, self.n_agents, self.avail_action_num))
        logging.debug("out action: {}".format(out_actions))

        return out_actions, hidden_state_out

    def reset_hidden_state(self):
        """Reset hidden state with value assign."""
        self.hi_out_val = self.hi_out_val_default

    def infer_actions(self, agent_inputs):
        """Unify inference api."""
        out_val, self.hi_out_val = self.sess.run(
            [self.agent_outs, self.hidden_outs],
            feed_dict={
                self.ph_obs: agent_inputs,
                self.ph_hidden_states_in: self.hi_out_val,
            },
        )
        return out_val

    def gather_custom(self, inputs, indices):
        indices = tf.cast(indices, tf.uint8)
        one_hot = tf.squeeze(
            tf.one_hot(indices=indices, depth=self.n_actions, on_value=1.,
                       off_value=0., axis=-1, dtype=tf.float32),
            axis=-2)
        mul_test = tf.multiply(inputs, one_hot)
        # reduce_sum_val = tf.reduce_sum(mul_test, axis=-1, keep_dims=True)
        reduce_sum_val = tf.reduce_sum(mul_test, axis=-1)
        return reduce_sum_val

    def _build_mix_net2(self, agent_qs, states):
        hypernet_embed = self.model_config["hypernet_embed"]

        def hyper_w1(hyper_w1_input):
            """
            Create hyper_w1.

            input shape (none, state_dim)
            """
            with tf.variable_scope("hyper_w1"):
                hw0 = tf.layers.dense(inputs=hyper_w1_input,
                                      units=hypernet_embed,
                                      activation=tf.nn.relu)
                hw1 = tf.layers.dense(inputs=hw0,
                                      units=self.embed_dim * self.n_agents,
                                      activation=None)
                return hw1

        def hyper_w_final(hyper_w_final_input):
            """
            Create hyper_w_final.

            input shape (none, state_dim)
            """
            with tf.variable_scope("hyper_w_final"):
                hw_f0 = tf.layers.dense(
                    inputs=hyper_w_final_input,
                    units=hypernet_embed,
                    activation=tf.nn.relu,
                )
                hw_f1 = tf.layers.dense(inputs=hw_f0,
                                        units=self.embed_dim,
                                        activation=None)
                return hw_f1

        def hyper_b1(state_input):
            """State dependent bias for hidden layer."""
            with tf.variable_scope("hyper_b1"):
                return tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=None)

        def val(state_input):
            """V(s) instead of a bias for the last layers."""
            with tf.variable_scope("val_for_bias"):
                val0 = tf.layers.dense(inputs=state_input,
                                       units=self.embed_dim,
                                       activation=tf.nn.relu)
                val2 = tf.layers.dense(inputs=val0, units=1, activation=None)
                return val2

        bs = agent_qs.get_shape().as_list()[0]
        states_reshaped = tf.reshape(states, (-1, self.state_dim))
        agent_qs_reshaped = tf.reshape(agent_qs, (-1, 1, self.n_agents))

        # first layer
        w1 = tf.math.abs(hyper_w1(states_reshaped))
        b1 = hyper_b1(states_reshaped)

        w1_reshaped = tf.reshape(w1, (-1, self.n_agents, self.embed_dim))
        b1_reshaped = tf.reshape(b1, (-1, 1, self.embed_dim))

        to_hidden_val = tf.math.add(
            tf.matmul(agent_qs_reshaped, w1_reshaped), b1_reshaped)
        hidden = tf.nn.elu(to_hidden_val)

        # second layer
        w_final = tf.math.abs(hyper_w_final(states_reshaped))
        w_final_reshaped = tf.reshape(w_final, (-1, self.embed_dim, 1))

        # state-dependent bias
        v = tf.reshape(val(states_reshaped), (-1, 1, 1))

        # compute final output
        y = tf.math.add(tf.matmul(hidden, w_final_reshaped), v)

        # reshape and return
        q_tot = tf.reshape(y, (bs, -1, 1))

        return q_tot

    @staticmethod
    def _print_trainable_var_name(**kwargs):
        """Print trainable variable name."""
        for k, v in kwargs.items():
            logging.info("{}: \n {}".format(k, list([t.name for t in v])))

    def build_train_graph(self):
        """
        Build train graph.

        Because seq_max differs (1 for explore vs. the episode limit for
        training), the train graph cannot connect to the actor graph directly.
        Hence we build separate explore and train sub-graphs and sync them
        with tf.assign between the two variable collections.
        :return:
        """
        with self.graph.as_default():
            with tf.variable_scope("eval_agent"):
                trajectory_agent_outs, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    seq_max=self.fix_seq_length + 1,  # important: full trajectory length
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,  # full trajectory, no need to carry the hidden state
                )

            with tf.variable_scope("target_agent"):
                tar_agent_outs_tmp, _ = self.build_agent_net(
                    inputs_obs=self.ph_train_obs,
                    # fixed value; differs between explore and train
                    seq_max=self.fix_seq_length + 1,
                    obs_lengths=self.ph_train_obs_len,
                    hidden_state_in=None,
                )
                target_trajectory_agent_outs = tf.stop_gradient(tar_agent_outs_tmp)

            _eval_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_agent")
            _target_agent_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_agent")

            with tf.variable_scope("soft_replacement"):
                self.agent_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_agent_paras,
                                                    _eval_agent_paras)]

                self.agent_explore_replace_op = [
                    tf.assign(t, e) for t, e in zip(self._explore_paras,
                                                    _eval_agent_paras)
                ]

            self._print_trainable_var_name(
                _eval_agent_paras=_eval_agent_paras,
                _target_agent_paras=_target_agent_paras,
                _explore_paras=self._explore_paras,
            )

            # agent out to max q values
            # Calculate estimated Q-Values ----------------
            mac_out = tf.reshape(
                trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            logging.debug("mac_out: {}".format(mac_out))
            chosen_action_qvals = self.gather_custom(mac_out[:, :-1],
                                                     self.ph_actions)

            # Calculate the Q-Values necessary for the target -----------
            target_mac_out = tf.reshape(
                target_trajectory_agent_outs,
                [self.batch_size, self.fix_seq_length + 1, self.n_agents, -1],
            )
            target_mac_out = target_mac_out[:, 1:]

            # Mask out unavailable actions
            # target_mac_out[avail_actions[:, 1:] == 0] = -9999999
            indices = tf.equal(self.ph_avail_action[:, 1:], 0)
            mask_val = tf.tile(
                [[[[-999999.0]]]],
                [
                    self.batch_size,
                    self.fix_seq_length,
                    self.n_agents,
                    self.avail_action_num,
                ],
            )
            logging.debug("indices:{}, mask_val:{}, target mac out:{}".format(
                indices, mask_val, target_mac_out))

            target_mac_out = tf.where(indices, mask_val, target_mac_out)

            if self.use_double_q:
                # Get actions that maximise live Q (for double q-learning)
                mac_out_detach = tf.stop_gradient(tf.identity(mac_out[:, 1:]))
                mac_out_detach = tf.where(indices, mask_val, mac_out_detach)
                cur_max_actions = tf.expand_dims(
                    tf.argmax(mac_out_detach, axis=-1), -1)
                target_max_qvals = self.gather_custom(target_mac_out,
                                                      cur_max_actions)
            else:
                target_max_qvals = tf.reduce_max(target_mac_out, axis=[-1])

            # eval mixer ---------------
            with tf.variable_scope("eval_mixer"):
                self.q_tot = self._build_mix_net2(chosen_action_qvals,
                                                  self.ph_train_states)

            with tf.variable_scope("target_mixer"):
                q_tot_tmp = self._build_mix_net2(target_max_qvals,
                                                 self.ph_train_target_states)
                self.target_q_tot = tf.stop_gradient(q_tot_tmp)

            _eval_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="eval_mixer")
            _target_mix_paras = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES, scope="target_mixer")

            with tf.variable_scope("soft_replacement"):
                self.mix_train_replace_op = [
                    tf.assign(t, e) for t, e in zip(_target_mix_paras,
                                                    _eval_mix_paras)]

            self._print_trainable_var_name(_eval_mix_paras=_eval_mix_paras,
                                           _target_mix_paras=_target_mix_paras)

            # Calculate 1-step Q-Learning targets
            targets = (self.ph_rewards +
                       self.gamma * (1.0 - self.ph_terminated) * self.target_q_tot)

            # Td-error
            td_error = self.q_tot - tf.stop_gradient(targets)

            # mask = mask.expand_as(td_error)  # fixme: mask is assumed to already have the same shape

            # 0-out the targets that came from padded data
            masked_td_error = tf.multiply(td_error, self.ph_mask)

            self.loss = tf.reduce_sum(masked_td_error**2) / tf.reduce_sum(self.ph_mask)

            # Optimise
            optimizer = tf.train.RMSPropOptimizer(
                self.lr, decay=0.95, epsilon=1.5e-7, centered=True)
            grads_and_vars = optimizer.compute_gradients(self.loss)
            capped_gvs = [(
                grad if grad is None else tf.clip_by_norm(
                    grad, clip_norm=self.grad_norm_clip),
                var,
            ) for grad, var in grads_and_vars]
            self.grad_update = optimizer.apply_gradients(capped_gvs)

    def assign_targets(self):
        """
        Update weights periodically.

        1. from eval agent to target agent
        2. from eval mixer to target mixer
        :return:
        """
        _a, _m = self.sess.run([self.agent_train_replace_op,
                                self.mix_train_replace_op])

    def assign_explore_agent(self):
        """
        Update the explore agent after each training step.

        :return:
        """
        _ = self.sess.run(self.agent_explore_replace_op)

    def save_explore_agent_weights(self, save_path):
        """Save explore agent weight for explorer."""
        # explore_saver = tf.train.Saver({t.name: t for t in self._explore_paras})
        self.explore_saver.save(
            self.sess, save_path=save_path, write_meta_graph=False)
        # tf.train.list_variables(tf.train.latest_checkpoint(wp))

    def set_weights(self, weights):
        """Set weight with memory tensor."""
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Get the weights."""
        with self.graph.as_default():
            return self.actor_var.get_weights()

    def restore_explorer_variable(self, model_name):
        """Restore explorer variable with tf.train.checkpoint."""
        reader = tf.train.NewCheckpointReader(model_name)
        var_names = reader.get_variable_to_shape_map().keys()
        result = {}
        for n in var_names:
            result[n] = reader.get_tensor(n)
            logging.debug("read variable-{} from file:{}".format(n, model_name))
        with self.sess.as_default():  # must run inside this session
            for var_key in self._explore_paras:
                try:
                    var_key.load(result[var_key.name])
                    logging.debug("load {} success".format(var_key.name))
                except BaseException as err:
                    raise KeyError("update {} error:{}".format(var_key.name, err))

    def train(
            self,
            batch_trajectories,
            train_obs_len,
            avail_actions,
            actions,
            cur_stats,
            target_stats,
            rewards,
            terminated,
            mask):
        """
        Train with the whole graph.

        Update the explorer graph after each training step, and the target networks as required.

        :param batch_trajectories:
        :param train_obs_len: list, e.g. [max_ep for _ in range(batch.batch_size * n_agents)]
        :param avail_actions: avail action from environment
        :param actions: actual actions within trajectory
        :param cur_stats: batch["state"][:, :-1]
        :param target_stats: batch["state"][:, 1:]
        :param rewards:
        :param terminated:
        :param mask:
        :return:
        """
        _, loss_val = self.sess.run(
            [self.grad_update, self.loss],
            feed_dict={
                self.ph_train_obs: batch_trajectories,
                # Note: trajectories are split per agent.
                self.ph_train_obs_len: train_obs_len,
                self.ph_avail_action: avail_actions,
                self.ph_actions: actions,
                self.ph_train_states: cur_stats,
                self.ph_train_target_states: target_stats,
                self.ph_rewards: rewards,
                self.ph_terminated: terminated,
                self.ph_mask: mask,
            },
        )
        logging.debug("train_loss: {}".format(loss_val))
        return loss_val
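A minimal sketch of the explore-side cycle defined by the class above; qmix and the zero-filled observation batch are illustrative placeholders, and the (1, 1, n_agents, obs_shape) shape follows the ph_obs placeholder.

import numpy as np

qmix.reset_hidden_state()                             # start from the default GRU state
obs = np.zeros((1, 1, qmix.n_agents, qmix.obs_shape), dtype=np.float32)
q_values = qmix.infer_actions(obs)                    # also advances the hidden state
greedy_actions = np.argmax(q_values, axis=-1)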
Example #8
    def create_model(self, model_info):
        """Create Deep-Q network."""

        user_input = Input(shape=(self.user_dim,), name="user_input", dtype=self.input_type)
        history_click_input = Input(
            shape=(self.n_history_click * self.item_dim), name="history_click",
            dtype=self.input_type
        )
        history_no_click_input = Input(
            shape=(self.n_history_no_click * self.item_dim), name="history_no_click",
            dtype=self.input_type
        )
        item_input = Input(shape=(self.item_dim,), name="item_input", dtype=self.input_type)
        shared_embedding = Embedding(
            self.vocab_size,
            self.emb_dim,
            name="Emb",
            mask_zero=True,
            embeddings_initializer=self.embedding_initializer,
            trainable=False,
        )  # not trainable
        gru_click = GRU(self.item_dim * self.emb_dim)
        gru_no_click = GRU(self.item_dim * self.emb_dim)

        user_feature = Flatten()(shared_embedding(user_input))
        item_feature = Flatten()(shared_embedding(item_input))

        history_click_feature = Reshape(
            (self.n_history_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_click_input))
        history_click_feature = gru_click(history_click_feature)

        history_no_click_feature = Reshape(
            (self.n_history_no_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_no_click_input))
        history_no_click_feature = gru_no_click(history_no_click_feature)

        x = concatenate(
            [
                user_feature,
                history_click_feature,
                history_no_click_feature,
                item_feature,
            ]
        )
        x_dense1 = Dense(128, activation="relu")(x)
        x_dense2 = Dense(128, activation="relu")(x_dense1)
        # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2)
        ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2)
        model = Model(
            inputs=[
                user_input,
                history_click_input,
                history_no_click_input,
                item_input,
            ],
            outputs=ctr_pred,
        )
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        if self._summary:
            model.summary()

        self.user_input = tf.placeholder(
            dtype=self.input_type, name="user_input", shape=(None, self.user_dim)
        )
        self.history_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_click_input",
            shape=(None, self.n_history_click * self.item_dim),
        )
        self.history_no_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_no_click_input",
            shape=(None, self.n_history_no_click * self.item_dim),
        )
        self.item_input = tf.placeholder(
            dtype=self.input_type, name="item_input", shape=(None, self.item_dim)
        )

        self.ctr_predict = model(
            [
                self.user_input,
                self.history_click_input,
                self.history_no_click_input,
                self.item_input,
            ]
        )
        self.actor_var = TFVariables([self.ctr_predict], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model
Example #9
class DqnInfoFlowModel(XTModel):
    """DQN Class for information flow."""

    def __init__(self, model_info):
        """Init Dqn model for information flow."""
        model_config = model_info.get("model_config", None)
        import_config(globals(), model_config)

        self.state_dim = model_info["state_dim"]
        self.action_dim = model_info["action_dim"]

        self.tau = 0.01
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.vocab_size = model_info["vocab_size"]
        self.emb_dim = model_info["emb_dim"]
        self.user_dim = model_info["user_dim"]
        self.item_dim = model_info["item_dim"]

        self.input_type = model_info["input_type"]
        # logging.info("set input type: {}".format(self.input_type))

        self.embeddings = model_info["embeddings"]
        self.last_act = model_info["last_activate"]

        embedding_weights = np.loadtxt(self.embeddings, delimiter=",", dtype=float)
        self.embedding_initializer = tf.constant_initializer(embedding_weights)

        self.n_history_click = 5
        self.n_history_no_click = 5

        super().__init__(model_info)

    def create_model(self, model_info):
        """Create Deep-Q network."""

        user_input = Input(shape=(self.user_dim,), name="user_input", dtype=self.input_type)
        history_click_input = Input(
            shape=(self.n_history_click * self.item_dim), name="history_click",
            dtype=self.input_type
        )
        history_no_click_input = Input(
            shape=(self.n_history_no_click * self.item_dim), name="history_no_click",
            dtype=self.input_type
        )
        item_input = Input(shape=(self.item_dim,), name="item_input", dtype=self.input_type)
        shared_embedding = Embedding(
            self.vocab_size,
            self.emb_dim,
            name="Emb",
            mask_zero=True,
            embeddings_initializer=self.embedding_initializer,
            trainable=False,
        )  # not trainable
        gru_click = GRU(self.item_dim * self.emb_dim)
        gru_no_click = GRU(self.item_dim * self.emb_dim)

        user_feature = Flatten()(shared_embedding(user_input))
        item_feature = Flatten()(shared_embedding(item_input))

        history_click_feature = Reshape(
            (self.n_history_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_click_input))
        history_click_feature = gru_click(history_click_feature)

        history_no_click_feature = Reshape(
            (self.n_history_no_click, self.item_dim * self.emb_dim)
        )(shared_embedding(history_no_click_input))
        history_no_click_feature = gru_no_click(history_no_click_feature)

        x = concatenate(
            [
                user_feature,
                history_click_feature,
                history_no_click_feature,
                item_feature,
            ]
        )
        x_dense1 = Dense(128, activation="relu")(x)
        x_dense2 = Dense(128, activation="relu")(x_dense1)
        # ctr_pred = Dense(1, activation="linear", name="q_value")(x_dense2)
        ctr_pred = Dense(1, activation=self.last_act, name="q_value")(x_dense2)
        model = Model(
            inputs=[
                user_input,
                history_click_input,
                history_no_click_input,
                item_input,
            ],
            outputs=ctr_pred,
        )
        model.compile(loss="mse", optimizer=Adam(lr=self.learning_rate))
        if self._summary:
            model.summary()

        self.user_input = tf.placeholder(
            dtype=self.input_type, name="user_input", shape=(None, self.user_dim)
        )
        self.history_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_click_input",
            shape=(None, self.n_history_click * self.item_dim),
        )
        self.history_no_click_input = tf.placeholder(
            dtype=self.input_type,
            name="history_no_click_input",
            shape=(None, self.n_history_no_click * self.item_dim),
        )
        self.item_input = tf.placeholder(
            dtype=self.input_type, name="item_input", shape=(None, self.item_dim)
        )

        self.ctr_predict = model(
            [
                self.user_input,
                self.history_click_input,
                self.history_no_click_input,
                self.item_input,
            ]
        )
        self.actor_var = TFVariables([self.ctr_predict], self.sess)

        self.sess.run(tf.initialize_all_variables())
        return model

    def train(self, state, label, batch_size, verbose=False):
        """Train the model."""
        with self.graph.as_default():
            K.set_session(self.sess)
            history = self.model.fit(
                state, label, batch_size=batch_size, verbose=verbose
            )
            return history.history["loss"][0]

    def predict(self, state):
        """
        Predict with the latest model.

        :param state:
        :return:
        """
        with self.graph.as_default():
            # K.set_session(self.sess)
            # return np.array(self.model.predict_on_batch(state)).reshape(-1)
            feed_dict = {
                self.user_input: state["user_input"],
                self.history_click_input: state["history_click"],
                self.history_no_click_input: state["history_no_click"],
                self.item_input: state["item_input"],
            }
            return np.array(self.sess.run(self.ctr_predict, feed_dict)).reshape(-1)

    def set_weights(self, weights):
        """Set weight with memory tensor."""
        # distinguish Keras weights from xingtian npz weights
        with self.graph.as_default():
            K.set_session(self.sess)
            if isinstance(weights, dict) and self.actor_var:
                self.actor_var.set_weights(weights)
            else:  # keras
                self.model.set_weights(weights)

    def get_weights(self):
        """Get the weights."""
        with self.graph.as_default():
            K.set_session(self.sess)
            return self.model.get_weights()

    def load_model(self, model_name):
        if self.actor_var and str(model_name).endswith(".npz"):
            self.actor_var.set_weights_with_npz(model_name)
        else:
            with self.graph.as_default():
                K.set_session(self.sess)
                self.model.load_weights(model_name)
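A minimal usage sketch for DqnInfoFlowModel.predict above; dqn, the batch size of 2, and the int32 dtype (a typical input_type for embedding lookups) are assumptions for illustration.

import numpy as np

batch = 2
state = {
    "user_input": np.zeros((batch, dqn.user_dim), dtype=np.int32),
    "history_click": np.zeros((batch, dqn.n_history_click * dqn.item_dim), dtype=np.int32),
    "history_no_click": np.zeros((batch, dqn.n_history_no_click * dqn.item_dim), dtype=np.int32),
    "item_input": np.zeros((batch, dqn.item_dim), dtype=np.int32),
}
q_values = dqn.predict(state)    # flat array with one predicted value per sample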
Example #10
class ImpalaCnnOpt(XTModel):
    """Docstring for ActorNetwork."""
    def __init__(self, model_info):
        model_config = model_info.get("model_config", dict())
        import_config(globals(), model_config)
        self.dtype = DTYPE_MAP.get(model_info.get("default_dtype", "float32"))
        self.input_dtype = model_info.get("input_dtype", "float32")
        self.sta_mean = model_info.get("state_mean", 0.)
        self.sta_std = model_info.get("state_std", 255.)

        self._transform = partial(state_transform,
                                  mean=self.sta_mean,
                                  std=self.sta_std,
                                  input_dtype=self.input_dtype)

        self.state_dim = model_info["state_dim"]
        self.action_dim = model_info["action_dim"]
        self.filter_arch = get_atari_filter(self.state_dim)

        # lr schedule with linear_cosine_decay
        self.lr_schedule = model_config.get("lr_schedule", None)
        self.opt_type = model_config.get("opt_type", "adam")
        self.lr = None

        self.ph_state = None
        self.ph_adv = None
        self.out_actions = None
        self.pi_logic_outs, self.baseline = None, None

        # placeholder for behavior policy logic outputs
        self.ph_bp_logic_outs = None
        self.ph_actions = None
        self.ph_dones = None
        self.ph_rewards = None
        self.loss, self.optimizer, self.train_op = None, None, None

        self.grad_norm_clip = model_config.get("grad_norm_clip", 40.0)
        self.sample_batch_steps = model_config.get("sample_batch_step", 50)

        self.saver = None
        self.explore_paras = None
        self.actor_var = None  # store weights for agent

        super().__init__(model_info)

    def create_model(self, model_info):
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")

        self.ph_actions = tf.placeholder(tf.int32,
                                         shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool,
                                       shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype,
                                         shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) *
                                    GAMMA,
                                    drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(LR,
                                                  decay=0.99,
                                                  epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))

        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # fixme: expose the learning rate for monitoring during training
        self.lr = optimizer._lr

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        self.saver = Saver({t.name: t
                            for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True

    def _get_lr(self, global_step, decay_step=20000.):
        """Make decay learning rate."""
        lr_schedule = self.lr_schedule
        if len(lr_schedule) != 2:
            logging.warning("lr_schedule needs exactly 2 elements, "
                            "e.g. [[0, 0.01], [20000, 0.000001]]")
            logging.fatal("lr_schedule invalid: {}".format(lr_schedule))

        if lr_schedule[0][0] != 0:
            logging.info("lr_schedule[0][1] is used as the initial learning rate")

        learning_rate = linear_cosine_decay(lr_schedule[0][1],
                                            global_step,
                                            decay_step,
                                            beta=lr_schedule[1][1] /
                                            float(decay_step))

        return learning_rate

    def train(self, state, label):
        """Train with sess.run."""
        bp_logic_outs, actions, dones, rewards = label
        with self.graph.as_default():
            _, loss = self.sess.run(
                [self.train_op, self.loss],
                feed_dict={
                    self.ph_state: state,
                    self.ph_bp_logic_outs: bp_logic_outs,
                    self.ph_actions: actions,
                    self.ph_dones: dones,
                    self.ph_rewards: rewards,
                },
            )
        return loss

    def predict(self, state):
        """
        Predict with the latest model.

        :param state:
        :return: action_logits, value, action
        """
        with self.graph.as_default():
            feed_dict = {self.ph_state: state}
            return self.sess.run(
                [self.pi_logic_outs, self.baseline, self.out_actions],
                feed_dict)

    def save_model(self, file_name):
        """Save model without meta graph."""
        ck_name = self.saver.save(self.sess,
                                  save_path=file_name,
                                  write_meta_graph=False)
        return ck_name

    def load_model(self, model_name, by_name=False):
        """Load model with inference variables."""
        restore_tf_variable(self.sess, self.explore_paras, model_name)

    def set_weights(self, weights):
        """Set weight with memory tensor."""
        with self.graph.as_default():
            self.actor_var.set_weights(weights)

    def get_weights(self):
        """Get weights."""
        with self.graph.as_default():
            return self.actor_var.get_weights()
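A minimal sketch of the predict/train round trip exposed by the class above; model, the 84x84x4 frame shape, and the zero-filled dones and rewards are illustrative placeholders, and the batch length of 50 matches the default sample_batch_step.

import numpy as np

frames = np.zeros((50, 84, 84, 4), dtype=np.float32)    # assumed state_dim
logits, values, actions = model.predict(frames)         # actor-side inference
label = (logits, actions,
         np.zeros(50, dtype=np.bool_),                  # dones
         np.zeros(50, dtype=np.float32))                # rewards
loss = model.train(frames, label)                       # learner-side update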
Example #11
    def create_model(self, model_info):
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")

        self.ph_actions = tf.placeholder(tf.int32,
                                         shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool,
                                       shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype,
                                         shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) *
                                    GAMMA,
                                    drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            optimizer = tf.train.RMSPropOptimizer(LR,
                                                  decay=0.99,
                                                  epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))

        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # fixme: expose the learning rate for monitoring during training
        self.lr = optimizer._lr

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        self.saver = Saver({t.name: t
                            for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True
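To illustrate the reshaping that split_batches above performs ([batch_count * batch_step] -> [batch_step, batch_count]), here is a small NumPy analogue with made-up numbers; it only mirrors the shape handling, not the TensorFlow graph code.

import numpy as np

batch_step = 3
flat = np.arange(6)                         # two rollouts of three steps: [0 1 2 3 4 5]
stacked = flat.reshape(-1, batch_step)      # [batch_count, batch_step] = [[0 1 2], [3 4 5]]
time_major = stacked.transpose(1, 0)        # [batch_step, batch_count] = [[0 3], [1 4], [2 5]]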