# Assumed imports for this snippet (TF 1.x graph mode with standalone Keras layers).
import tensorflow as tf
from keras.layers import Embedding, Dense, Bidirectional, CuDNNLSTM


def __init__(self, config, pretrained_embedding):
    # Placeholders for the padded token ids and the integer class labels.
    self._input = tf.placeholder(dtype=tf.int32, shape=[None, None], name='input')
    self._target = tf.placeholder(dtype=tf.int32, shape=[None], name='target')

    self.batch_size = config['batch_size']
    self.num_steps = config['num_steps']
    self.embed_size = config['embed_size']
    self.size = config['hidden_size']
    self._lr = config['lr']
    self.num_classes = config['num_classes']
    self.keep_prob = tf.Variable(config['keep_prob'], trainable=False)
    self.combine_mode = config['combine_mode']
    self.weight_decay = config['weight_decay']

    # Alternative encoder (kept for reference):
    # outputs = LSTMEncoderWithEmbedding(self._input, self.embed_size, self.size,
    #                                    config['vocab_size'], self.num_steps,
    #                                    self.keep_prob, embedding=pretrained_embedding,
    #                                    num_layers=config['num_layers'],
    #                                    variational_dropout=True,
    #                                    combine_mode='last').get_output()

    embed = Embedding(config['vocab_size'] + 1, self.embed_size)(self._input)
    outputs = tf.nn.dropout(embed, keep_prob=self.keep_prob)

    # outputs = Bidirectional(CuDNNLSTM(self.size, return_sequences=True))(outputs)
    # outputs = tf.nn.dropout(outputs, keep_prob=self.keep_prob)
    outputs = Bidirectional(CuDNNLSTM(self.size, return_sequences=True))(outputs)

    # After the bidirectional layer the feature size is 2 * hidden_size.
    self.size = int(outputs.get_shape().as_list()[-1])

    if self.combine_mode == 'weight':
        # Learn a scalar weight per time step, then sum the weighted states.
        outputs = tf.reshape(outputs, [-1, self.size])
        weights = Dense(1, activation='tanh')(outputs)
        outputs = tf.multiply(outputs, weights)
        outputs = tf.reshape(outputs, [-1, self.num_steps, self.size])
        outputs = tf.reduce_sum(outputs, axis=1)
    elif self.combine_mode == 'last':
        outputs = outputs[:, -1, :]
    elif self.combine_mode == 'all':
        # Concatenate weighted-sum, last, mean, max and min pooling of the states.
        weights = Dense(1, activation='tanh')(outputs)
        outputs_weighted = tf.multiply(outputs, weights)
        # self.size already includes both directions, so no extra factor of 2 here.
        outputs_weighted = tf.reshape(outputs_weighted, [-1, self.num_steps, self.size])
        outputs_weighted = tf.reduce_sum(outputs_weighted, axis=1)
        outputs_last = outputs[:, -1, :]
        outputs_mean = tf.reduce_mean(outputs, axis=1)
        outputs_max = tf.reduce_max(outputs, axis=1)
        outputs_min = tf.reduce_min(outputs, axis=1)
        outputs = tf.concat([outputs_last, outputs_mean, outputs_max,
                             outputs_min, outputs_weighted], axis=-1)

    # outputs = tf.nn.dropout(outputs, keep_prob=self.keep_prob)

    # Append the mean of the word embeddings as an extra feature.
    embed_avg = tf.reduce_mean(embed, axis=1)
    # embed_max = tf.reduce_max(embed, axis=1)
    # embed_min = tf.reduce_min(embed, axis=1)
    # outputs = tf.concat([outputs, embed_avg, embed_min, embed_max], axis=-1)
    outputs = tf.concat([outputs, embed_avg], axis=-1)

    # outputs = tf.contrib.layers.fully_connected(outputs, self.size)
    # outputs = tf.nn.dropout(outputs, keep_prob=self.keep_prob)
    # softmax_w = tf.get_variable("softmax_w", [self.size, self.num_classes], dtype=tf.float32)
    # softmax_b = tf.get_variable("softmax_b", [self.num_classes], dtype=tf.float32)
    # logits = tf.matmul(outputs, softmax_w) + softmax_b
    logits = Dense(self.num_classes, activation=None)(outputs)

    # Cross-entropy loss plus L2 weight decay over all trainable variables.
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self._target, logits=logits)
    self.l2_loss = sum(tf.nn.l2_loss(tf_var) for tf_var in tf.trainable_variables())
    self._cost = cost = tf.reduce_mean(loss) + self.weight_decay * self.l2_loss

    # Adam with gradient clipping and an externally updatable learning rate.
    self._lr = tf.Variable(self._lr, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), config['max_grad_norm'])
    optimizer = tf.train.AdamOptimizer(self._lr)
    # optimizer = tf.train.GradientDescentOptimizer(self._lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    self._new_lr = tf.placeholder(tf.float32, shape=[], name="new_learning_rate")
    self._lr_update = tf.assign(self._lr, self._new_lr)

    self.predicted_class = tf.cast(tf.argmax(tf.nn.softmax(logits), axis=-1), tf.int32)
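# A minimal usage sketch for the graph defined above, assuming the __init__ belongs to a
# class named BiLSTMClassifier (hypothetical name) and that `config`, `pretrained_embedding`,
# `x_batch`, and `y_batch` are prepared elsewhere. It only exercises ops created in __init__.
model = BiLSTMClassifier(config, pretrained_embedding)  # hypothetical class name
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # One training step: feed a padded batch of token ids and its labels.
    _, cost = sess.run([model._train_op, model._cost],
                       feed_dict={model._input: x_batch, model._target: y_batch})
    # Decay the learning rate through the assign op defined in __init__.
    sess.run(model._lr_update, feed_dict={model._new_lr: 0.5 * config['lr']})
    # Predict classes for a batch.
    preds = sess.run(model.predicted_class, feed_dict={model._input: x_batch})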
# Assumed imports for this snippet (Keras 2.x functional API on top of TF 1.x);
# `encoding_vectors` and `nb_classes` are assumed to be defined at module level.
import tensorflow as tf
from keras import backend as K
from keras import optimizers
from keras.models import Model
from keras.layers import (Input, Embedding, Convolution1D, MaxPooling1D, Dropout,
                          Bidirectional, LSTM, Dense, Lambda, Activation, Multiply)
from keras.initializers import random_normal


def cnn_bilstm_model(pooling_size=3, nb_filters=32, filters_length=10,
                     lstm_units=32, attention_size=50):
    '''Build the CNN + BiLSTM + attention model.'''
    input = Input(shape=(None,), dtype='int8')

    # Frozen embedding layer initialized from the precomputed encoding vectors.
    embedding_layer = Embedding(len(encoding_vectors), len(encoding_vectors[0]),
                                weights=[encoding_vectors], input_length=None,
                                trainable=False)
    embedding_output = embedding_layer(input)

    with tf.name_scope('first_cnn'):
        # First CNN layer; output shape is (batch_size, steps, filters).
        cnn_output = Dropout(0.2)(
            MaxPooling1D(pool_size=pooling_size, strides=pooling_size)(
                Convolution1D(nb_filters, filters_length, padding='same',
                              activation='relu')(embedding_output)))

    with tf.name_scope('second_cnn'):
        # Stack another CNN layer on top.
        cnn_output = Dropout(0.2)(
            MaxPooling1D(pool_size=pooling_size, strides=pooling_size)(
                Convolution1D(nb_filters, filters_length, padding='same',
                              activation='relu')(cnn_output)))

    with tf.name_scope('third_cnn'):
        # Stack another CNN layer on top.
        cnn_output = Dropout(0.2)(
            MaxPooling1D(pool_size=pooling_size, strides=pooling_size)(
                Convolution1D(nb_filters, filters_length, padding='same',
                              activation='relu')(cnn_output)))

    with tf.name_scope('fourth_cnn'):
        # Stack another CNN layer on top.
        cnn_output = Dropout(0.2)(
            MaxPooling1D(pool_size=pooling_size, strides=pooling_size)(
                Convolution1D(nb_filters, filters_length, padding='same',
                              activation='relu')(cnn_output)))

    with tf.name_scope('bilstm_layer'):
        # Output shape: (batch_size, time_steps, hidden_size = 2 * lstm_units).
        lstm_output = Bidirectional(
            LSTM(lstm_units, dropout=0.1, return_sequences=True))(cnn_output)

    hidden_size = lstm_output.get_shape()[2].value
    print('hidden size:', hidden_size)

    with tf.name_scope('attention_module'):
        # [batch_size, time_steps, attention_size]
        context_weights = Dense(attention_size, activation='tanh',
                                kernel_initializer=random_normal(),
                                bias_initializer=random_normal())(lstm_output)
        # [batch_size, time_steps]
        scores = Lambda(lambda x: K.batch_flatten(x))(
            Dense(1, kernel_initializer=random_normal(),
                  use_bias=False)(context_weights))
        # Softmax probability distribution over time steps,
        # expanded to [batch_size, time_steps, 1].
        attention_weights = Lambda(lambda x: K.expand_dims(x, axis=-1))(
            Activation('softmax')(scores))
        # Multiply() behaves like tf.multiply() and supports broadcasting, so its
        # output shape is [batch_size, time_steps, hidden_size]; the Lambda sum over
        # axis 1 is equivalent to tf.reduce_sum(axis=1) and yields [batch_size, hidden_size].
        output = Lambda(lambda x: K.sum(x, axis=1, keepdims=False))(
            Multiply()([lstm_output, attention_weights]))

    preds = Dense(nb_classes, activation='softmax')(output)
    model = Model(inputs=[input], outputs=preds)

    optim = optimizers.Adam(lr=0.0001)
    # optim = optimizers.SGD(lr=0.001)
    model.compile(loss='kld', optimizer=optim, metrics=['acc'])
    return model
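# A minimal usage sketch for cnn_bilstm_model, assuming `x_train` is an integer-encoded,
# padded sequence array and `y_train` a one-hot label matrix prepared elsewhere (hypothetical
# names here); the 'kld' loss expects probability-like targets such as one-hot rows.
model = cnn_bilstm_model(pooling_size=3, nb_filters=32, filters_length=10,
                         lstm_units=32, attention_size=50)
model.summary()
model.fit(x_train, y_train, batch_size=64, epochs=10, validation_split=0.1)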
# Assumed imports for this snippet (Keras 2.x functional API); `x_train`, `word_index`,
# `EMBEDDING_DIM`, and `embedding_matrix` are assumed to be defined elsewhere.
from keras.layers import (Input, Embedding, Bidirectional, LSTM, Dense, Flatten,
                          Activation, RepeatVector, Permute)

maxLen = len(max(x_train, key=len))
units = 128

_input = Input(shape=(maxLen,), dtype='float32')

# Frozen embedding layer initialized from the precomputed embedding matrix.
embedding_layer = Embedding(input_dim=len(word_index) + 1,
                            output_dim=EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxLen,
                            trainable=False)
embedded = embedding_layer(_input)

# Bidirectional LSTM with units/2 per direction, so the concatenated output has `units` features.
activations = Bidirectional(LSTM(int(units / 2), return_sequences=True),
                            name='bidirectional_lstm')(embedded)
print(activations.get_shape())  # (None, 229, 128)
# activations = LSTM(units, return_sequences=True)(embedded)

# Compute an importance score for each time step.
attention = Dense(1, activation='tanh')(activations)
# print(attention.get_shape())  # (None, 229, 1)
attention = Flatten()(attention)
# print(attention.get_shape())  # (None, 229)
attention = Activation('softmax')(attention)
# print(attention.get_shape())  # (None, 229)
attention = RepeatVector(units)(attention)
# print(attention.get_shape())  # (None, 128, 229)
attention = Permute([2, 1])(attention)
# print(attention.get_shape())  # (None, 229, 128)

# apply the attention
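# The snippet stops at "apply the attention". A plausible continuation, following the same
# weighted-sum pattern as the attention module in cnn_bilstm_model above, is sketched below;
# Multiply, Lambda, Model, the final Dense, and `num_classes` are assumptions, not part of
# the original code.
from keras import backend as K
from keras.layers import Multiply, Lambda
from keras.models import Model

weighted = Multiply()([activations, attention])                       # (None, 229, 128)
sent_representation = Lambda(lambda x: K.sum(x, axis=1))(weighted)    # (None, 128)
probabilities = Dense(num_classes, activation='softmax')(sent_representation)
model = Model(inputs=_input, outputs=probabilities)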