def __init__(self, config, pretrained_embedding):
    self._input         = tf.placeholder(dtype=tf.int32,shape=[None,None],name='input')
    self._target        = tf.placeholder(dtype=tf.int32,shape=[None],name='target')
    self.batch_size     = config['batch_size']
    self.num_steps      = config['num_steps']
    self.embed_size     = config['embed_size']
    self.size           = config['hidden_size']
    self._lr            = config['lr']
    self.num_classes    = config['num_classes']
    self.keep_prob      = tf.Variable(config['keep_prob'],trainable=False)
    self.combine_mode   = config['combine_mode']
    self.weight_decay   = config['weight_decay']


    #
    # outputs = LSTMEncoderWithEmbedding(self._input,self.embed_size,self.size,\
    #                          config['vocab_size'],self.num_steps,\
    #                          self.keep_prob,embedding=pretrained_embedding,\
    #                          num_layers=config['num_layers'],\
    #                          variational_dropout=True,\
    #                          combine_mode='last').get_output()

    embed = Embedding(config['vocab_size']+1, self.embed_size)(self._input)
    outputs = tf.nn.dropout(embed,keep_prob=self.keep_prob)
    # outputs = Bidirectional(CuDNNLSTM(self.size,return_sequences=True))(outputs)
    # outputs = tf.nn.dropout(outputs,keep_prob=self.keep_prob)
    outputs = Bidirectional(CuDNNLSTM(self.size,return_sequences=True))(outputs)

    self.size = int(outputs.get_shape().as_list()[-1])
    if self.combine_mode =='weight':
        outputs = tf.reshape(outputs,[-1,self.size])
        weights = Dense(1,activation='tanh')(outputs)
        outputs = tf.multiply(outputs,weights)
        outputs = tf.reshape(outputs,[-1,self.num_steps,self.size])
        outputs = tf.reduce_sum(outputs,axis=1)
    elif self.combine_mode =='last':
        outputs = outputs[:,-1,:]
    elif self.combine_mode =='all':
        weights = Dense(1,activation='tanh')(outputs)
        outputs_weighted = tf.multiply(outputs,weights)
        outputs_weighted = tf.reshape(outputs_weighted,[-1,self.num_steps,self.size])  # self.size was updated above to the full bidirectional width
        outputs_weighted = tf.reduce_sum(outputs_weighted,axis=1)
        outputs_last = outputs[:,-1,:]
        outputs_mean = tf.reduce_mean(outputs,axis=1)
        outputs_max  = tf.reduce_max(outputs,axis=1)
        outputs_min  = tf.reduce_min(outputs,axis=1)
        outputs = tf.concat([outputs_last,outputs_mean,outputs_max,outputs_min,outputs_weighted],axis=-1)
#     outputs = tf.nn.dropout(outputs,keep_prob=self.keep_prob)

    embed_avg = tf.reduce_mean(embed,axis=1)
#     embed_max = tf.reduce_max(embed,axis=1)
#     embed_min = tf.reduce_min(embed,axis=1)
#     outputs = tf.concat([outputs,embed_avg,embed_min,embed_max],axis=-1)
    outputs = tf.concat([outputs, embed_avg], axis=-1)
    # outputs = tf.contrib.layers.fully_connected(outputs,self.size)
#     outputs = tf.nn.dropout(outputs,keep_prob=self.keep_prob)
    # softmax_w = tf.get_variable("softmax_w", [self.size, self.num_classes], dtype=tf.float32)
    # softmax_b = tf.get_variable("softmax_b", [self.num_classes], dtype=tf.float32)
    # logits    = tf.matmul(outputs, softmax_w) + softmax_b
    logits = Dense(self.num_classes,activation=None)(outputs)


    # update the cost variables
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self._target,logits=logits)
    self.l2_loss = sum(tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
    self._cost = cost = tf.reduce_mean(loss) + self.weight_decay*self.l2_loss

    self._lr = tf.Variable(self._lr, trainable=False)
    tvars    = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config['max_grad_norm'])
    optimizer = tf.train.AdamOptimizer(self._lr)
#     optimizer = tf.train.GradientDescentOptimizer(self._lr)

    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self._new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)
    self.predicted_class = tf.cast(tf.argmax(tf.nn.softmax(logits),axis=-1),tf.int32)
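
# A minimal stand-alone NumPy sketch (not part of the model above) of what the
# combine_mode == 'all' branch computes: the last step, mean, max, min and a
# tanh-weighted sum over time are concatenated into one feature vector.
# `w` and `b` stand in for the Dense(1, activation='tanh') scoring layer and
# are illustrative assumptions, not the model's actual weights.
import numpy as np

def combine_all(outputs, w, b):
    # outputs: [batch, num_steps, size], w: [size, 1], b: [1]
    scores = np.tanh(outputs @ w + b)            # [batch, num_steps, 1]
    weighted = (outputs * scores).sum(axis=1)    # [batch, size]
    last = outputs[:, -1, :]                     # [batch, size]
    mean = outputs.mean(axis=1)
    mx = outputs.max(axis=1)
    mn = outputs.min(axis=1)
    return np.concatenate([last, mean, mx, mn, weighted], axis=-1)  # [batch, 5*size]

_demo = combine_all(np.random.randn(4, 7, 6), np.random.randn(6, 1), np.zeros(1))
print(_demo.shape)  # (4, 30)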
Example 2
def cnn_bilstm_model(pooling_size=3,
                     nb_filters=32,
                     filters_length=10,
                     lstm_units=32,
                     attention_size=50):
    '''build model'''
    input = Input(shape=(None, ), dtype='int8')
    embedding_layer = Embedding(len(encoding_vectors),
                                len(encoding_vectors[0]),
                                weights=[encoding_vectors],
                                input_length=None,
                                trainable=False)
    embedding_output = embedding_layer(input)
    with tf.name_scope('first_cnn'):
        # first cnn layer
        cnn_output = Dropout(0.2)(
            MaxPooling1D(pool_size=pooling_size, strides=pooling_size)(
                Convolution1D(nb_filters,
                              filters_length,
                              padding='same',
                              activation='relu')(embedding_output))
            # output shape is (batch_size, steps, filters)
        )
    with tf.name_scope('Second_cnn'):
        # stack another cnn layer on top
        cnn_output = Dropout(0.2)(MaxPooling1D(
            pool_size=pooling_size,
            strides=pooling_size)(Convolution1D(nb_filters,
                                                filters_length,
                                                padding='same',
                                                activation='relu')(cnn_output)))

    with tf.name_scope('Third_cnn'):
        # stack another cnn layer on top
        cnn_output = Dropout(0.2)(MaxPooling1D(
            pool_size=pooling_size,
            strides=pooling_size)(Convolution1D(nb_filters,
                                                filters_length,
                                                padding='same',
                                                activation='relu')(cnn_output)))

    with tf.name_scope('Fourth_cnn'):
        # stack another cnn layer on top
        cnn_output = Dropout(0.2)(MaxPooling1D(
            pool_size=pooling_size,
            strides=pooling_size)(Convolution1D(nb_filters,
                                                filters_length,
                                                padding='same',
                                                activation='relu')(cnn_output)))

    with tf.name_scope('bilstm_layer'):
        lstm_output = Bidirectional(
            LSTM(lstm_units,
                 dropout=0.1,
                 return_sequences=True,
                 input_shape=(None, nb_filters)))(cnn_output)
        # output shape: (batch_size, time steps, hidden size=2*nb_filters)

    hidden_size = lstm_output.get_shape()[2].value
    print('hidden size:', hidden_size)

    with tf.name_scope('attention_module'):
        # [batch_size, time_steps, attention_size]
        context_weights = Dense(attention_size,
                                activation='tanh',
                                kernel_initializer=random_normal(),
                                bias_initializer=random_normal())(lstm_output)
        # [batch_size, time_steps]
        scores = Lambda(lambda x: K.batch_flatten(x))(
            Dense(1, kernel_initializer=random_normal(),
                  use_bias=False)(context_weights))

        # softmax probability distribution, [batch_size, sequence_length]
        attention_weights = Lambda(lambda x: K.expand_dims(x, axis=-1))(
            Activation("softmax")(scores))

        # Multiply() behaves exactly as tf.multiply() which supports shape broadcasting, so its output_shape is [batch_size, time_steps, hidden_size]
        # Lambda(lambda x: K.sum(x, axis=1, keepdims=False)) is equivalent to tf.reduce_sum(axis=1)
        # [batch_size, hidden]
        output = Lambda(lambda x: K.sum(x, axis=1, keepdims=False))(
            Multiply()([lstm_output, attention_weights]))

    preds = Dense(nb_classes, activation='softmax')(output)
    model = Model(inputs=[input], outputs=preds)
    from keras import optimizers
    optim = optimizers.Adam(lr=0.0001)
    # optim = optimizers.sgd(lr=0.001)
    model.compile(loss='kld', optimizer=optim, metrics=['acc'])
    return model
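
# A hedged usage sketch for cnn_bilstm_model(). The function reads the
# module-level globals `encoding_vectors` and `nb_classes`, so the dummy
# values below are illustrative assumptions, not the original data; the
# Keras/TensorFlow imports used by the function are assumed to be in scope.
import numpy as np

encoding_vectors = np.random.randn(5, 24).astype('float32')   # hypothetical 5-symbol vocab, 24-dim vectors
nb_classes = 3                                                 # hypothetical number of classes

model = cnn_bilstm_model(pooling_size=3, nb_filters=32,
                         filters_length=10, lstm_units=32, attention_size=50)
x_dummy = np.random.randint(0, 5, size=(8, 81)).astype('int8')  # 8 sequences of length 81
y_dummy = np.eye(nb_classes)[np.random.randint(0, nb_classes, 8)]
model.fit(x_dummy, y_dummy, epochs=1, batch_size=4, verbose=0)

Example 3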
maxLen = len(max(x_train, key=len))
units = 128

_input = Input(shape=(maxLen, ), dtype='float32')
# get the embedding layer
embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxLen,
                            trainable=False)

embedded = embedding_layer(_input)

activations = Bidirectional(LSTM(int(units / 2), return_sequences=True),
                            name='bidirectional_lstm')(embedded)
print(activations.get_shape())  # (None, 229, 128): two 64-unit directions concatenated
# activations = LSTM(units, return_sequences = True)(embedded)

# compute importance for each step
attention = Dense(1, activation='tanh')(activations)
# print(attention.get_shape())  # (None, 229, 1)
attention = Flatten()(attention)
# print(attention.get_shape())  # (None, 229)
attention = Activation('softmax')(attention)
# print(attention.get_shape())  # (None, 229)
attention = RepeatVector(units)(attention)
# print(attention.get_shape())  # (None, 128, 229)
attention = Permute([2, 1])(attention)
# print(attention.get_shape())  # (None, 229, 128)

# apply the attention
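# The original snippet breaks off here; a hedged sketch of the usual way the
# attention weights are applied from this point. The Multiply/Lambda/Dense
# closing steps and `n_classes` are assumptions, not the original author's code.
sent_representation = Multiply()([activations, attention])                      # (None, maxLen, 128)
sent_representation = Lambda(lambda x: K.sum(x, axis=1))(sent_representation)   # (None, 128)
probabilities = Dense(n_classes, activation='softmax')(sent_representation)     # n_classes is hypothetical
model = Model(inputs=_input, outputs=probabilities)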