Example 1
    def build(self, input_shape):

        self.dense_1 = Dense(4 * self.output_dim,
                             kernel_initializer=tn(stddev=self.init_range))
        self.dense_1.build(input_shape)
        self._trainable_weights += self.dense_1.trainable_weights

        self.dense_2 = Dense(self.output_dim,
                             kernel_initializer=tn(stddev=self.init_range))
        self.dense_2.build(
            (input_shape[0], input_shape[1], 4 * self.output_dim))
        self._trainable_weights += self.dense_2.trainable_weights

        # Multi Head Attention #
        self.multihead_attention = MultiHeadAttention(self.attention_dim,
                                                      self.n_heads,
                                                      self.init_range)
        self.multihead_attention.build(input_shape)
        self._trainable_weights += self.multihead_attention.trainable_weights

        # LayerNorm #
        self.layer_normalization_1 = LayerNormalization()
        self.layer_normalization_1.build(input_shape)
        self._trainable_weights += self.layer_normalization_1.trainable_weights

        # LayerNorm #
        self.layer_normalization_2 = LayerNormalization()
        self.layer_normalization_2.build(input_shape)
        self._trainable_weights += self.layer_normalization_2.trainable_weights

        # Gelu #
        self.gelu = Gelu()
        self.gelu.build((input_shape[0], input_shape[1], 4 * self.output_dim))

        super(SentenceEncoderBlock, self).build(input_shape)
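The snippets in this section lean on names defined outside the excerpts. In particular, `tn` is called with `mean`/`stddev` keyword arguments, which matches Keras's truncated-normal initializer, while `SelfAttention`, `MultiHeadAttention`, `PKM`, `Gelu`, `LayerNormalization` and the encoder blocks are custom or add-on layers from the same project (several of their build() methods appear below). A minimal sketch of the imports the examples appear to assume; the alias and module paths are an inference, not shown in the source:

# Assumed imports for the snippets in this section (inferred from the names
# used above; the source may import them differently, e.g. standalone Keras).
from tensorflow.keras import layers
from tensorflow.keras.initializers import TruncatedNormal as tn
from tensorflow.keras.layers import (Add, Concatenate, Dense, Dropout,
                                     Embedding, GlobalAveragePooling1D,
                                     GlobalMaxPooling1D, Input, Lambda,
                                     SpatialDropout1D, TimeDistributed)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

The build() above wires the usual transformer encoder sub-layers: multi-head attention, two layer normalizations, and a two-layer feed-forward network (width 4 * output_dim, then back to output_dim) with a GELU between them.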
Example 2
    def create_policy_network(self, state_dim, action_dim):
        # build network model
        S = Input(shape=[state_dim])
        h0 = Dense(self.HIDDEN1_UNITS,
                   activation='elu',
                   kernel_initializer=tn(mean=0.0, stddev=1e-4))(S)
        h1 = Dense(self.HIDDEN2_UNITS,
                   activation='elu',
                   kernel_initializer=tn(mean=0.0, stddev=1e-4))(h0)
        V = Dense(action_dim,
                  activation='tanh',
                  kernel_initializer=tn(mean=0.0, stddev=1e-4))(h1)
        model = Model(inputs=S, outputs=V)

        adam = Adam(lr=self.lr)
        model.compile(loss=self.BATCH_LOSS, optimizer=adam)

        return model, S
Example 3
    def create_qvalue_network(self, state_dim, action_dim):
        # build network model
        S = Input(shape=[state_dim])
        w1 = Dense(self.HIDDEN1_UNITS,
                   activation='elu',
                   kernel_initializer=tn(mean=0.0, stddev=1e-2))(S)
        w2 = Dense(self.HIDDEN1_UNITS,
                   activation='linear',
                   kernel_initializer=tn(mean=0.0, stddev=1e-2))(w1)

        A = Input(shape=[action_dim])
        a1 = Dense(self.HIDDEN2_UNITS,
                   activation='linear',
                   kernel_initializer=tn(mean=0.0, stddev=1e-2))(A)

        h1 = layers.concatenate([w2, a1])
        h2 = Dense(self.HIDDEN2_UNITS,
                   activation='elu',
                   kernel_initializer=tn(mean=0.0, stddev=1e-2))(h1)
        V = Dense(1, activation='linear')(h2)
        model = Model(inputs=[S, A], outputs=V)

        adam = Adam(lr=self.lr)
        model.compile(loss=self.BATCH_LOSS, optimizer=adam)

        return model, A, S
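Examples 2 and 3 are the policy (actor) and Q-value (critic) builders of what looks like a DDPG-style agent: both read their hyperparameters from `self` and also return their Input tensors, presumably so the caller can wire custom training ops on top of the compiled models. A minimal sketch of the attributes the two methods assume; the attribute names come from the snippets, the concrete values are only illustrative:

# Hypothetical container for the hyperparameters used by the two factory
# methods above (values are placeholders, not taken from the source).
class AgentConfig:
    HIDDEN1_UNITS = 300   # width of the first hidden layer
    HIDDEN2_UNITS = 600   # width of the second hidden layer
    BATCH_LOSS = "mse"    # loss name passed to model.compile
    lr = 1e-3             # learning rate for Adam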
Example 4
    def build(self, input_shape):
        self.wq = self.add_weight(shape=(input_shape[-1], self.d),
                                  name="Wq",
                                  initializer=tn(
                                      stddev=self.init_range),
                                  trainable=True)

        self.wk = self.add_weight(shape=(input_shape[-1], self.d),
                                  name="Wk",
                                  initializer=tn(
                                      stddev=self.init_range),
                                  trainable=True)

        self.wv = self.add_weight(shape=(input_shape[-1], self.d),
                                  name="Wv",
                                  initializer=tn(
                                      stddev=self.init_range),
                                  trainable=True)

        super(SelfAttention, self).build(input_shape)
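The build() above only creates the projection matrices Wq, Wk and Wv. A hedged sketch of how such weights are typically consumed in the layer's call(), assuming standard scaled dot-product self-attention; this is an illustration, not the author's actual call() code:

import tensorflow as tf

def self_attention_call(x, wq, wk, wv):
    # x: (batch, seq_len, input_dim); wq, wk, wv: (input_dim, d)
    q = tf.matmul(x, wq)                                   # (batch, seq_len, d)
    k = tf.matmul(x, wk)
    v = tf.matmul(x, wv)
    scale = tf.sqrt(tf.cast(tf.shape(q)[-1], q.dtype))
    scores = tf.matmul(q, k, transpose_b=True) / scale     # (batch, seq, seq)
    return tf.matmul(tf.nn.softmax(scores, axis=-1), v)    # (batch, seq, d)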
Example 5
    def build(self, input_shape):

        self.w = self.add_weight(shape=(self.d * self.n_heads,
                                        input_shape[-1]),
                                 name="w",
                                 initializer=tn(stddev=self.init_range),
                                 trainable=True)

        for i in range(self.n_heads):
            self.heads[i] = SelfAttention(self.d, self.init_range)
            self.heads[i].build(input_shape)
            self._trainable_weights += self.heads[i].trainable_weights

        super(MultiHeadAttention, self).build(input_shape)
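Continuing the same hedged reading: each SelfAttention head yields a (batch, seq_len, d) tensor, and `self.w`, with shape (d * n_heads, input_dim), projects the concatenated heads back to the input width so the residual connection in the encoder block lines up. A sketch of that combination step (again an assumption, not the author's call()):

import tensorflow as tf

def multi_head_combine(head_outputs, w):
    # head_outputs: list of n_heads tensors of shape (batch, seq_len, d)
    # w: (d * n_heads, input_dim)
    concat = tf.concat(head_outputs, axis=-1)   # (batch, seq_len, d * n_heads)
    return tf.matmul(concat, w)                 # (batch, seq_len, input_dim)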
Example 6
    def build(self, input_shape):
        # Query Layers #
        self.query_layers = []
        for i in range(self.n_heads):
            self.query_layers.append(Dense(self.k_dim, kernel_initializer=tn(
                stddev=self.init_range)))

        for i in range(self.n_heads):
            self.query_layers[i].build(input_shape)
            self._trainable_weights += self.query_layers[i].trainable_weights

        # Value Embeddings #
        self.values = Embedding(self.memory_size ** 2, self.output_dim,
                                embeddings_initializer=tn(
                                   stddev=self.init_range))

        self.values.build(input_shape)

        self._trainable_weights += self.values.trainable_weights

        # Keys #
        self._trainable_weights += [self.keys]

        super(PKM, self).build(input_shape)
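The PKM layer builds per-head query projections, a value table of memory_size ** 2 embeddings, and reuses `self.keys` created elsewhere. This is the product-key memory layout of Lample et al. (2019): each query is split into two halves, each half is scored against memory_size sub-keys, and the Cartesian product of the two top-k candidate sets indexes the value table. A heavily hedged single-head sketch of that lookup; the shapes and the structure of `self.keys` are assumptions, not taken from the source:

import tensorflow as tf

def product_key_lookup(query, sub_keys_1, sub_keys_2, values, top_k=32):
    # query:      (N, k_dim)               -- one query layer's output, flattened over batch/seq
    # sub_keys_1: (memory_size, k_dim / 2) -- first half-key set
    # sub_keys_2: (memory_size, k_dim / 2) -- second half-key set
    # values:     Embedding(memory_size ** 2, output_dim), as built above
    q1, q2 = tf.split(query, 2, axis=-1)
    s1 = tf.matmul(q1, sub_keys_1, transpose_b=True)            # (N, memory_size)
    s2 = tf.matmul(q2, sub_keys_2, transpose_b=True)
    top1 = tf.math.top_k(s1, k=top_k)
    top2 = tf.math.top_k(s2, k=top_k)
    # Cartesian product of the two candidate sets -> top_k * top_k full keys
    scores = top1.values[:, :, None] + top2.values[:, None, :]
    memory_size = tf.shape(sub_keys_2)[0]
    indices = top1.indices[:, :, None] * memory_size + top2.indices[:, None, :]
    scores = tf.reshape(scores, (-1, top_k * top_k))
    indices = tf.reshape(indices, (-1, top_k * top_k))
    best = tf.math.top_k(scores, k=top_k)
    best_idx = tf.gather(indices, best.indices, batch_dims=1)   # (N, top_k)
    weights = tf.nn.softmax(best.values, axis=-1)
    mem = values(best_idx)                                      # (N, top_k, output_dim)
    return tf.reduce_sum(weights[..., None] * mem, axis=1)      # (N, output_dim)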
Example 7
    def build(self):

        with tf.device("/device:GPU:0"):
            input_tokens = Input(shape=(None, ))
            input_positions = Input(shape=(None, ))
            input_segments = Input(shape=(None, ))

            token_embedding_matrix = Embedding(
                self.vocab_size + 1,
                self.embedding_size,
                input_length=self.input_length,
                embeddings_initializer=tn(stddev=self.init_range))

            pos_embedding_matrix = Embedding(
                (2 * self.max_len) + 4,
                self.embedding_size,
                input_length=self.input_length,
                embeddings_initializer=tn(stddev=self.init_range))

            seg_embedding_matrix = Embedding(
                2,
                self.embedding_size,
                input_length=self.input_length,
                embeddings_initializer=tn(stddev=self.init_range))

            token_embeddings = token_embedding_matrix(input_tokens)
            position_embeddings = pos_embedding_matrix(input_positions)
            segment_embeddings = seg_embedding_matrix(input_segments)

            sum_embeddings = Add()([token_embeddings, position_embeddings])
            sum_embeddings = Add()([sum_embeddings, segment_embeddings])

            if self.factorize_embeddings:
                sum_embeddings = Dense(
                    self.encoder_size[0],
                    kernel_initializer=tn(
                        stddev=self.init_range))(sum_embeddings)
                sum_embeddings = Gelu()(sum_embeddings)

            if self.input_dropout != 0.:
                sum_embeddings = SpatialDropout1D(
                    self.input_dropout)(sum_embeddings)

            ant_layer = sum_embeddings

            encoders = []

            if self.cross_sharing:
                first_encoder = SentenceEncoderBlock(
                    self.encoder_size[0],
                    self.attention_size[0],
                    self.n_heads[0],
                    dropout=self.output_dropout,
                    init_range=self.init_range)

        flag_mem = 0
        for i in range(self.n_encoders):

            if self.pkm and i in self.pkm_params["in_layers"]:
                encoders.append(
                    SentenceEncoderMemoryBlock(self.encoder_size[0],
                                               self.attention_size[0],
                                               self.n_heads[0],
                                               self.pkm_params,
                                               dropout=self.output_dropout,
                                               init_range=self.init_range))
                flag_mem = 1
            else:
                if self.cross_sharing:
                    encoders.append(first_encoder)
                else:
                    encoders.append(
                        SentenceEncoderBlock(self.encoder_size[0],
                                             self.attention_size[0],
                                             self.n_heads[0],
                                             dropout=self.output_dropout,
                                             init_range=self.init_range))

            if flag_mem == 1:
                with tf.device("/device:GPU:1"):
                    encoded = encoders[-1](ant_layer)
                    ant_layer = encoded
                flag_mem = 0
                #print("Layer: %d -> %s : Allocated in GPU: %d" % (
                #    i, encoders[-1], 1))
            else:
                with tf.device("/device:GPU:%d" % (i % 2)):
                    encoded = encoders[-1](ant_layer)
                    ant_layer = encoded
                #print("Layer: %d -> %s : Allocated in GPU: %d" % (
                #    i, encoders[-1], (i % 2)))

        # Reply Order Prediction #
        if self.use_rop:
            cls_output = Lambda(lambda x: x[:, 0, :])(ant_layer)
            rop_hidden = cls_output
            for i in range(self.rop_n_hidden):
                rop_hidden = Dense(self.rop_hidden_size,
                                   kernel_initializer=tn(
                                       stddev=self.init_range))(rop_hidden)
                rop_hidden = Gelu()(rop_hidden)
                rop_hidden = LayerNormalization()(rop_hidden)

            output_reply_tweet = Dense(2,
                                       activation="softmax",
                                       kernel_initializer=tn(
                                           stddev=self.init_range),
                                       name="rop")(rop_hidden)

        mlm_outputs = TimeDistributed(Dense(self.vocab_size,
                                            activation="softmax",
                                            kernel_initializer=tn(
                                                stddev=self.init_range)),
                                      name="mlm")(ant_layer)

        if self.use_rop:
            self.model = Model(
                inputs=[input_tokens, input_positions, input_segments],
                outputs=[output_reply_tweet, mlm_outputs])
        else:
            self.model = Model(
                inputs=[input_tokens, input_positions, input_segments],
                outputs=[mlm_outputs])

        self.pretrained_model = Model(
            inputs=[input_tokens, input_positions, input_segments],
            outputs=ant_layer)
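build() in Example 7 only assembles `self.model` (the pretraining heads) and `self.pretrained_model` (the bare encoder stack); compiling and training happen elsewhere. A hypothetical compile step, assuming an already-constructed instance here called `encoder`; only the output names "rop" and "mlm" come from the code above, the optimizer, losses and loss weights are illustrative:

# Hypothetical pretraining setup; `encoder` is an instance of the class whose
# build() is shown above. Losses and weights are placeholders.
encoder.build()
encoder.model.compile(
    optimizer="adam",
    loss={"rop": "categorical_crossentropy",
          "mlm": "sparse_categorical_crossentropy"},
    loss_weights={"rop": 1.0, "mlm": 1.0})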
Example 8
def finetune_ffn(pretrained_model,
                 n_classes,
                 trainable_layers="all",
                 collapse_mode="cls",
                 finetune_dropout=0.15,
                 loss="categorical_crossentropy",
                 init_range=0.02,
                 lr=0.001,
                 multi_label=False,
                 optimizer="adam",
                 accum_iters=1):

    assert collapse_mode in ["cls", "max", "avg", "concat"]
    if trainable_layers != "all":
        assert type(trainable_layers) == list
        model_layers = []
        for layer in pretrained_model.layers:
            layer.trainable = False
            if "embedding" in layer.name or "encoder" in layer.name:
                model_layers.append(layer)
        for k in trainable_layers:
            model_layers[k].trainable = True

    input_tokens = Input(shape=(None, ))
    input_positions = Input(shape=(None, ))
    input_segments = Input(shape=(None, ))

    pretrained_output = pretrained_model(
        [input_tokens, input_positions, input_segments])

    if collapse_mode == "cls":
        cls_output = Lambda(lambda x: x[:, 0, :])(pretrained_output)

    else:

        if collapse_mode == "avg":
            cls_output = GlobalAveragePooling1D()(pretrained_output)
        elif collapse_mode == "max":
            cls_output = GlobalMaxPooling1D()(pretrained_output)
        elif collapse_mode == "concat":
            avg = GlobalAveragePooling1D()(pretrained_output)
            mx = GlobalMaxPooling1D()(pretrained_output)
            cls = Lambda(lambda x: x[:, 0, :])(pretrained_output)
            cls_output = Concatenate(axis=-1)([cls, avg, mx])

    cls_output = Dropout(finetune_dropout)(cls_output)

    if not multi_label:
        output = Dense(n_classes,
                       activation="softmax",
                       kernel_initializer=tn(stddev=init_range))(cls_output)

    else:
        output = Dense(n_classes,
                       activation="sigmoid",
                       kernel_initializer=tn(stddev=init_range))(cls_output)

    finetune_model = Model(
        inputs=[input_tokens, input_positions, input_segments], outputs=output)

    if optimizer == "adam_accumulated":
        opt = ADAM(lr=lr, accum_iters=accum_iters)
    elif optimizer == "lamb_accumulated":
        opt = LAMB(lr=lr, accum_iters=accum_iters)
    else:
        opt = optimizer

    loss = loss_indexer(loss)
    finetune_model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])

    return finetune_model
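ADAM, LAMB and loss_indexer in Example 8 appear to be project-level helpers (gradient-accumulating optimizers and a loss lookup) not shown in this section. A hypothetical call pattern for the fine-tuning helper, reusing the encoder from Example 7; the instance name and every hyperparameter value here are illustrative, not taken from the source:

# Hypothetical fine-tuning step on top of the pretrained encoder stack.
classifier = finetune_ffn(encoder.pretrained_model,
                          n_classes=3,
                          trainable_layers="all",
                          collapse_mode="concat",   # [CLS] + avg-pool + max-pool
                          finetune_dropout=0.1,
                          lr=2e-5,
                          optimizer="adam")
# classifier.fit(...) then takes the same three inputs as pretraining:
# [token_ids, position_ids, segment_ids], each of shape (batch, seq_len).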