Esempio n. 1
0
    def create_model(self, model_info):
        """Create Deep-Q network.

        The network is one ReLU ``Dense`` layer of width ``HIDDEN_SIZE`` on
        the state input, followed by ``NUM_LAYERS - 1`` more of the same, and
        a linear head with ``self.action_dim`` outputs. When ``self.dueling``
        is set, a second single-unit linear head is merged with that output
        through the ``layer_normalize`` and ``layer_add`` Lambda layers.

        Side effects: stores the inference placeholder in ``self.infer_state``,
        the model output for that placeholder in ``self.infer_v``, a
        ``TFVariables`` wrapper in ``self.actor_var``, and runs the global
        variable initializer in ``self.sess``.

        :param model_info: model configuration; not read by this method.
        :return: the Keras model, compiled with MSE loss and an Adam optimizer.
        """
        state = Input(shape=self.state_dim)
        denselayer = Dense(HIDDEN_SIZE, activation='relu')(state)
        for _ in range(NUM_LAYERS - 1):
            denselayer = Dense(HIDDEN_SIZE, activation='relu')(denselayer)

        value = Dense(self.action_dim, activation='linear')(denselayer)
        if self.dueling:
            # NOTE(review): the single-unit head is called `adv` although it
            # looks like the state-value stream of a dueling layout; the exact
            # combination depends on layer_normalize/layer_add - confirm there.
            adv = Dense(1, activation='linear')(denselayer)
            mean = Lambda(layer_normalize)(value)
            value = Lambda(layer_add)([adv, mean])

        model = Model(inputs=state, outputs=value)
        adam = Adam(lr=self.learning_rate)
        model.compile(loss='mse', optimizer=adam)

        # Placeholder-driven copy of the model output, used for inference
        # through the raw session instead of the Keras predict path.
        self.infer_state = tf.placeholder(tf.float32,
                                          name="infer_input",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.infer_v = model(self.infer_state)
        self.actor_var = TFVariables([self.infer_v], self.sess)

        # tf.initialize_all_variables() is deprecated; this is its
        # documented replacement and builds the same init op.
        self.sess.run(tf.global_variables_initializer())
        return model
Esempio n. 2
0
    def create_model(self, model_info):
        """Create Deep-Q CNN network.

        The ``uint8`` state input is cast to ``float32`` and divided by 255,
        passed through three ReLU convolutions (32@8x8/4, 64@4x4/2, 64@3x3/1,
        all with 'valid' padding), flattened, fed to a 256-unit ReLU dense
        layer and finally to a linear head with ``self.action_dim`` outputs.

        Side effects: stores the inference placeholder in ``self.infer_state``,
        the model output for that placeholder in ``self.infer_v``, a
        ``TFVariables`` wrapper in ``self.actor_var``, and runs the global
        variable initializer in ``self.sess``.

        :param model_info: mapping-like configuration; when its ``"summary"``
            entry is truthy the Keras model summary is printed.
        :return: the Keras model, compiled with MSE loss and an Adam optimizer
            using ``clipnorm=10``.
        """
        state = Input(shape=self.state_dim, dtype="uint8")
        # Scale raw byte values into [0, 1] inside the graph.
        state1 = Lambda(lambda x: K.cast(x, dtype='float32') / 255.)(state)
        convlayer = Conv2D(32, (8, 8),
                           strides=(4, 4),
                           activation='relu',
                           padding='valid')(state1)
        convlayer = Conv2D(64, (4, 4),
                           strides=(2, 2),
                           activation='relu',
                           padding='valid')(convlayer)
        convlayer = Conv2D(64, (3, 3),
                           strides=(1, 1),
                           activation='relu',
                           padding='valid')(convlayer)
        flattenlayer = Flatten()(convlayer)
        denselayer = Dense(256, activation='relu')(flattenlayer)
        value = Dense(self.action_dim, activation='linear')(denselayer)
        model = Model(inputs=state, outputs=value)
        adam = Adam(lr=self.learning_rate, clipnorm=10.)
        model.compile(loss='mse', optimizer=adam)
        if model_info.get("summary"):
            model.summary()

        # Placeholder-driven copy of the model output, used for inference
        # through the raw session instead of the Keras predict path.
        self.infer_state = tf.placeholder(tf.uint8,
                                          name="infer_input",
                                          shape=(None, ) +
                                          tuple(self.state_dim))
        self.infer_v = model(self.infer_state)
        self.actor_var = TFVariables([self.infer_v], self.sess)

        # tf.initialize_all_variables() is deprecated; this is its
        # documented replacement and builds the same init op.
        self.sess.run(tf.global_variables_initializer())
        return model
Esempio n. 3
0
    def create_model(self, model_info):
        """Create the two-headed CNN model (action probabilities and value).

        The model takes two inputs: the ``uint8`` state (pre-processed by
        ``layer_function``) and a one-column ``adv`` input that is only
        consumed by the action-head loss, ``impala_loss(advantage)``. The
        state goes through three ReLU convolutions, a flatten and a 256-unit
        ReLU dense layer; a softmax head of size ``self.action_dim`` and a
        single-unit value head sit on top.

        Side effects: stores the placeholders ``self.infer_state`` and
        ``self.adv``, the model outputs for them in ``self.infer_p`` and
        ``self.infer_v``, and runs the global variable initializer in
        ``self.sess``.

        :param model_info: model configuration; not read by this method.
        :return: the compiled Keras model.
        """
        state_input = Input(shape=self.state_dim, name='state_input', dtype='uint8')
        state_input_1 = Lambda(layer_function)(state_input)
        advantage = Input(shape=(1, ), name='adv')

        convlayer = Conv2D(32, (8, 8), strides=(4, 4), activation='relu', padding='valid')(state_input_1)
        convlayer = Conv2D(64, (4, 4), strides=(2, 2), activation='relu', padding='valid')(convlayer)
        convlayer = Conv2D(64, (3, 3), strides=(1, 1), activation='relu', padding='valid')(convlayer)
        flattenlayer = Flatten()(convlayer)
        denselayer = Dense(256, activation='relu')(flattenlayer)

        out_actions = Dense(self.action_dim, activation='softmax', name='output_actions')(denselayer)
        out_value = Dense(1, name='output_value')(denselayer)
        model = Model(inputs=[state_input, advantage], outputs=[out_actions, out_value])

        # Per-output losses and weights are keyed by the output layer names.
        losses = {"output_actions": impala_loss(advantage), "output_value": 'mse'}
        lossweights = {"output_actions": 1.0, "output_value": .5}

        decay_value = 0.00000000512
        model.compile(optimizer=Adam(lr=LR, clipnorm=40., decay=decay_value), loss=losses, loss_weights=lossweights)

        # Placeholder-driven copy of both model outputs for session inference.
        self.infer_state = tf.placeholder(tf.uint8, name="infer_state",
                                          shape=(None,) + tuple(self.state_dim))
        self.adv = tf.placeholder(tf.float32, name="adv", shape=(None, 1))
        self.infer_p, self.infer_v = model([self.infer_state, self.adv])

        # tf.initialize_all_variables() is deprecated; this is its
        # documented replacement and builds the same init op.
        self.sess.run(tf.global_variables_initializer())

        return model
Esempio n. 4
0
 def create_rep_network(self):
     """Build the representation network.

     The input named 'rep_input' has shape ``self.state_dim``. It is cast to
     float32, divided by 255, run through three ReLU convolutions with
     'valid' padding, flattened and projected by a ReLU dense layer of width
     ``HIDDEN_OUT``.

     :return: Keras ``Model`` mapping the observation to the hidden vector.
     """
     frames = Input(shape=self.state_dim, name='rep_input')
     features = Lambda(lambda x: tf.cast(x, dtype='float32') / 255.)(frames)

     # (filters, kernel size, strides) of each convolution, applied in order.
     conv_specs = (
         (32, (8, 8), (4, 4)),
         (32, (4, 4), (2, 2)),
         (64, (3, 3), (1, 1)),
     )
     for filters, kernel, strides in conv_specs:
         features = Conv2D(filters, kernel, strides=strides,
                           activation='relu', padding='valid')(features)

     flat = Flatten()(features)
     # NOTE: normalisation of the hidden vector (hidden_normlize) is currently
     # disabled, so the dense output is returned unchanged.
     hidden = Dense(HIDDEN_OUT, activation='relu')(flat)
     return Model(inputs=frames, outputs=hidden)
Esempio n. 5
0
def get_cnn_backbone(state_dim,
                     act_dim,
                     hidden_sizes,
                     activation,
                     filter_arches,
                     vf_share_layers=True,
                     summary=False,
                     dtype='uint8'):
    """Get CNN backbone.

    Builds a Keras model with one input named 'obs' and two linear outputs:
    'pi_latent' with ``act_dim`` units and 'output_value' with one unit.

    :param state_dim: shape passed to the Keras ``Input`` layer.
    :param act_dim: number of units of the 'pi_latent' output.
    :param hidden_sizes: forwarded to ``bulid_mlp_layers``.
    :param activation: forwarded to both layer builders.
    :param filter_arches: forwarded to ``build_conv_layers``.
    :param vf_share_layers: if true, both heads sit on one 'shared' trunk;
        otherwise separate 'pi' and 'v' trunks are built.
    :param summary: if true, print the Keras model summary.
    :param dtype: 'uint8' routes the input through ``layer_function``;
        'float32' feeds the input to the trunk unchanged.
    :raises ValueError: if ``dtype`` is neither 'uint8' nor 'float32'.
    :return: the (uncompiled) Keras ``Model``.
    """
    raw_obs = Input(shape=state_dim, name='obs')
    if dtype not in ('uint8', 'float32'):
        raise ValueError(
            'dtype: {} not supported automatically, please implement it yourself'
            .format(dtype))
    net_input = Lambda(layer_function)(raw_obs) if dtype == 'uint8' else raw_obs

    if vf_share_layers:
        # One trunk feeds both heads.
        shared_conv = build_conv_layers(net_input, filter_arches, activation,
                                        'shared')
        shared_flat = Flatten()(shared_conv)
        shared_mlp = bulid_mlp_layers(shared_flat, hidden_sizes, activation,
                                      'shared')
        pi_features = v_features = shared_mlp
    else:
        # Independent trunks; the pi/v creation order is kept interleaved.
        pi_conv = build_conv_layers(net_input, filter_arches, activation, 'pi')
        v_conv = build_conv_layers(net_input, filter_arches, activation, 'v')
        pi_flat = Flatten()(pi_conv)
        v_flat = Flatten()(v_conv)
        pi_features = bulid_mlp_layers(pi_flat, hidden_sizes, activation, 'pi')
        v_features = bulid_mlp_layers(v_flat, hidden_sizes, activation, 'v')

    pi_latent = Dense(act_dim, activation=None, name='pi_latent')(pi_features)
    out_value = Dense(1, activation=None, name='output_value')(v_features)

    model = Model(inputs=[raw_obs], outputs=[pi_latent, out_value])
    if summary:
        model.summary()

    return model
Esempio n. 6
0
    def create_model(self, model_info):
        """Build the convolutional two-headed model and its graph.

        The ``uint8`` state input is pre-processed by ``layer_function``,
        passed through three ReLU convolutions with 'valid' padding,
        flattened and fed to the 256-unit ReLU layer 'dense_1'. Two heads
        follow: a softmax head 'output_actions_raw' of size
        ``self.action_dim`` and a single-unit head 'output_value'.

        ``self.build_graph(np.uint8, model)`` is invoked before returning.

        :param model_info: model configuration; not read by this method.
        :return: the (uncompiled) Keras ``Model``.
        """
        state_input = Input(shape=self.state_dim, name='state_input', dtype='uint8')
        features = Lambda(layer_function)(state_input)

        # (filters, kernel size, strides) of each convolution, applied in order.
        conv_specs = (
            (32, (8, 8), (4, 4)),
            (32, (4, 4), (2, 2)),
            (64, (3, 3), (1, 1)),
        )
        for filters, kernel, strides in conv_specs:
            features = Conv2D(filters, kernel, strides=strides,
                              activation='relu', padding='valid')(features)

        features = Flatten()(features)
        features = Dense(256, activation='relu', name='dense_1')(features)

        policy_head = Dense(self.action_dim, activation='softmax',
                            name='output_actions_raw')(features)
        value_head = Dense(1, name='output_value')(features)
        model = Model(inputs=[state_input], outputs=[policy_head, value_head])

        self.build_graph(np.uint8, model)

        return model
Esempio n. 7
0
def get_cnn_backbone(state_dim,
                     act_dim,
                     hidden_sizes,
                     activation,
                     filter_arches,
                     vf_share_layers=True,
                     summary=False):
    """Get CNN backbone.

    Builds a Keras model with one input named 'obs', pre-processed by
    ``layer_function``, and two linear outputs: 'pi_latent' with ``act_dim``
    units and 'output_value' with one unit.

    :param state_dim: shape passed to the Keras ``Input`` layer.
    :param act_dim: number of units of the 'pi_latent' output.
    :param hidden_sizes: forwarded to ``bulid_mlp_layers``.
    :param activation: forwarded to both layer builders.
    :param filter_arches: forwarded to ``build_conv_layers``.
    :param vf_share_layers: if true, both heads sit on one 'shared' trunk;
        otherwise separate 'pi' and 'v' trunks are built.
    :param summary: if true, print the Keras model summary.
    :return: the (uncompiled) Keras ``Model``.
    """
    raw_obs = Input(shape=state_dim, name='obs')
    net_input = Lambda(layer_function)(raw_obs)

    if vf_share_layers:
        # One trunk feeds both heads.
        shared_conv = build_conv_layers(net_input, filter_arches, activation,
                                        'shared')
        shared_flat = Flatten()(shared_conv)
        shared_mlp = bulid_mlp_layers(shared_flat, hidden_sizes, activation,
                                      'shared')
        pi_features = v_features = shared_mlp
    else:
        # Independent trunks; the pi/v creation order is kept interleaved.
        pi_conv = build_conv_layers(net_input, filter_arches, activation, 'pi')
        v_conv = build_conv_layers(net_input, filter_arches, activation, 'v')
        pi_flat = Flatten()(pi_conv)
        v_flat = Flatten()(v_conv)
        pi_features = bulid_mlp_layers(pi_flat, hidden_sizes, activation, 'pi')
        v_features = bulid_mlp_layers(v_flat, hidden_sizes, activation, 'v')

    pi_latent = Dense(act_dim, activation=None, name='pi_latent')(pi_features)
    out_value = Dense(1, activation=None, name='output_value')(v_features)

    model = Model(inputs=[raw_obs], outputs=[pi_latent, out_value])
    if summary:
        model.summary()

    return model
Esempio n. 8
0
    def create_model(self, model_info):
        """Build the actor network, the V-trace loss and the training op.

        Everything is created directly in the TensorFlow graph (no Keras
        ``Model`` is returned). The method sets, among others:

        * ``self.ph_state`` - state placeholder feeding the network;
        * ``self.pi_logic_outs`` / ``self.baseline`` / ``self.out_actions`` -
          policy logits, value estimate and sampled action;
        * ``self.ph_bp_logic_outs``, ``self.ph_actions``, ``self.ph_dones``,
          ``self.ph_rewards`` - learner-side placeholders;
        * ``self.loss`` and ``self.train_op`` - ``vtrace_loss`` output and the
          gradient step with global-norm clipping;
        * ``self.lr`` - the learning rate handed to the optimizer;
        * ``self.actor_var``, ``self.explore_paras``, ``self.saver``.

        It also runs the global variable initializer in ``self.sess``.

        :param model_info: model configuration; not read by this method.
        :raises KeyError: if ``self.opt_type`` is neither "adam" nor "rmsprop".
        :return: ``True``.
        """
        self.ph_state = tf.placeholder(self.input_dtype,
                                       shape=(None, *self.state_dim),
                                       name="state_input")

        with tf.variable_scope("explore_agent"):
            state_input = Lambda(self._transform)(self.ph_state)
            last_layer = state_input

            # All but the last filter spec use 'same' padding.
            for (out_size, kernel, stride) in self.filter_arch[:-1]:
                last_layer = Conv2D(
                    out_size,
                    (kernel, kernel),
                    strides=(stride, stride),
                    activation="relu",
                    padding="same",
                )(last_layer)

            # last convolution
            (out_size, kernel, stride) = self.filter_arch[-1]
            convolution_layer = Conv2D(
                out_size,
                (kernel, kernel),
                strides=(stride, stride),
                activation="relu",
                padding="valid",
            )(last_layer)

            # Policy logits come from a 1x1 convolution; squeezing axes 1 and 2
            # requires the last convolution's output to be 1x1 in those axes.
            self.pi_logic_outs = tf.squeeze(
                Conv2D(self.action_dim, (1, 1),
                       padding="same")(convolution_layer),
                axis=[1, 2],
            )

            baseline_flat = Flatten()(convolution_layer)
            self.baseline = tf.squeeze(
                tf.layers.dense(
                    inputs=baseline_flat,
                    units=1,
                    activation=None,
                    kernel_initializer=custom_norm_initializer(0.01),
                ),
                1,
            )
            # Sample one action per row from the policy logits.
            self.out_actions = tf.squeeze(
                tf.multinomial(self.pi_logic_outs,
                               num_samples=1,
                               output_dtype=tf.int32),
                1,
                name="out_action",
            )

        # create learner
        self.ph_bp_logic_outs = tf.placeholder(self.dtype,
                                               shape=(None, self.action_dim),
                                               name="ph_b_logits")

        self.ph_actions = tf.placeholder(tf.int32,
                                         shape=(None, ),
                                         name="ph_action")
        self.ph_dones = tf.placeholder(tf.bool,
                                       shape=(None, ),
                                       name="ph_dones")
        self.ph_rewards = tf.placeholder(self.dtype,
                                         shape=(None, ),
                                         name="ph_rewards")

        # Split the tensor into batches at known episode cut boundaries.
        # [batch_count * batch_step] -> [batch_step, batch_count]
        batch_step = self.sample_batch_steps

        def split_batches(tensor, drop_last=False):
            """Reshape a flat batch to time-major form.

            The leading axis is reshaped to ``[batch_count, batch_step]`` and
            the first two axes are swapped; with ``drop_last`` the final step
            along the new leading axis is removed.
            """
            batch_count = tf.shape(tensor)[0] // batch_step
            reshape_tensor = tf.reshape(
                tensor,
                tf.concat([[batch_count, batch_step],
                           tf.shape(tensor)[1:]],
                          axis=0),
            )

            # swap B and T axes
            res = tf.transpose(
                reshape_tensor,
                [1, 0] + list(range(2, 1 + int(tf.shape(tensor).shape[0]))),
            )

            if drop_last:
                return res[:-1]
            return res

        # The last step of every sequence is used only as bootstrap value.
        self.loss = vtrace_loss(
            bp_logic_outs=split_batches(self.ph_bp_logic_outs, drop_last=True),
            tp_logic_outs=split_batches(self.pi_logic_outs, drop_last=True),
            actions=split_batches(self.ph_actions, drop_last=True),
            discounts=split_batches(tf.cast(~self.ph_dones, tf.float32) *
                                    GAMMA,
                                    drop_last=True),
            rewards=split_batches(tf.clip_by_value(self.ph_rewards, -1, 1),
                                  drop_last=True),
            values=split_batches(self.baseline, drop_last=True),
            bootstrap_value=split_batches(self.baseline)[-1],
        )

        global_step = tf.Variable(0, trainable=False, dtype=tf.int32)
        if self.opt_type == "adam":
            if self.lr_schedule:
                learning_rate = self._get_lr(global_step)
            else:
                learning_rate = LR
            optimizer = AdamOptimizer(learning_rate)
        elif self.opt_type == "rmsprop":
            learning_rate = LR
            optimizer = tf.train.RMSPropOptimizer(learning_rate,
                                                  decay=0.99,
                                                  epsilon=0.1,
                                                  centered=True)
        else:
            raise KeyError("invalid opt_type: {}".format(self.opt_type))

        grads_and_vars = optimizer.compute_gradients(self.loss)

        # global norm
        grads, var = zip(*grads_and_vars)
        grads, _ = tf.clip_by_global_norm(grads, self.grad_norm_clip)
        clipped_gvs = list(zip(grads, var))

        self.train_op = optimizer.apply_gradients(clipped_gvs,
                                                  global_step=global_step)

        # fixme: help to show the learning rate among training processing
        # Expose the value handed to the optimizer instead of reading the
        # private `optimizer._lr`: that attribute is not public API and the
        # RMSProp optimizer keeps its rate under `_learning_rate`, so the
        # old lookup failed with AttributeError on the "rmsprop" path.
        self.lr = learning_rate

        self.actor_var = TFVariables(self.out_actions, self.sess)

        self.sess.run(global_variables_initializer())

        self.explore_paras = tf.get_collection(
            tf.GraphKeys.TRAINABLE_VARIABLES, scope="explore_agent")

        # Only the variables under the "explore_agent" scope are checkpointed.
        self.saver = Saver({t.name: t
                            for t in self.explore_paras},
                           max_to_keep=self.max_to_keep)

        return True