# Assumed imports for this builder; `Evaluator` provides project-specific
# precision/recall/F1 metric functions defined elsewhere.
import tensorflow as tf
from keras import regularizers
from keras.layers import (Input, Embedding, Bidirectional, GRU, Dropout,
                          Flatten, Dense)
from keras.models import Model
from keras.optimizers import Adam
from keras_self_attention import ScaledDotProductAttention


def bigru_with_attention(max_len=74, emb_dim=32, max_vocab_len=40,
                         W_reg=regularizers.l2(1e-4)):
    """Bidirectional GRU with attention, built with the Keras functional API."""
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')

    # Embedding layer. Keras 2 renamed `W_regularizer` to
    # `embeddings_regularizer` and dropped the `dropout` argument,
    # so dropout is applied as a separate layer here.
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim,
                    input_length=max_len,
                    embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.2)(emb)

    # Bidirectional GRU layer
    gru = Bidirectional(GRU(units=128, return_sequences=True))(emb)
    gru = Dropout(0.2)(gru)

    # Self-attention over the GRU outputs
    att_layer, att_score = ScaledDotProductAttention(
        history_only=True,
        return_attention=True,
    )([gru, gru, gru])
    att = Flatten()(att_layer)

    # Wide fully connected hidden layer
    hidden1 = Dense(9472)(att)
    hidden1 = Dropout(0.5)(hidden1)

    # Output layer (last fully connected layer)
    output = Dense(21, activation='softmax', name='output')(hidden1)

    # Compile model and define optimizer (Keras 2 uses `inputs`/`outputs`)
    model = Model(inputs=[main_input], outputs=[output])
    adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(optimizer=adam,
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.CategoricalAccuracy(),
                           Evaluator.precision, Evaluator.recall,
                           Evaluator.fmeasure])
    return model
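# A minimal smoke test for the builder above -- a sketch, assuming the default
# shapes (74-token sequences, 40-word vocabulary, 21 classes) and that the
# project-specific `Evaluator` metrics are importable. The random batch is
# illustrative only.
import numpy as np

model = bigru_with_attention()
model.summary()

# Toy batch: 8 sequences of 74 token ids, labels one-hot over 21 classes
x = np.random.randint(0, 40, size=(8, 74))
y = np.eye(21)[np.random.randint(0, 21, size=8)]
model.fit(x, y, batch_size=4, epochs=1)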
def test_sample(self):
    input_layer = keras.layers.Input(
        shape=(5,),
        name='Input',
    )
    embed_layer = keras.layers.Embedding(
        input_dim=4,
        output_dim=5,
        mask_zero=True,
        weights=[
            np.array([
                [0.1, 0.2, 0.3, 0.4, 0.5],
                [0.2, 0.3, 0.4, 0.6, 0.5],
                [0.4, 0.7, 0.2, 0.6, 0.9],
                [0.3, 0.5, 0.8, 0.9, 0.1],
            ]),
        ],
        name='Embedding',
    )(input_layer)
    att_layer = ScaledDotProductAttention(name='Attention')(embed_layer)
    model = keras.models.Model(inputs=input_layer, outputs=att_layer)
    model.compile(optimizer='adam', loss='mse')
    model.summary()
    inputs = np.array([[1, 2, 3, 1, 0]])
    predict = model.predict(inputs)[0]
    # Token 1 appears at positions 0 and 3; with unmasked self-attention
    # their outputs must be identical.
    self.assertTrue(np.allclose(predict[0], predict[3]))
    self.assertTrue(
        np.allclose(
            np.asarray([
                0.27883747, 0.45767492, 0.47448885, 0.69199574, 0.47368336
            ]),
            predict[2],
        ),
        predict[2])
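# For reference, the layer under test computes standard scaled dot-product
# attention, softmax(Q K^T / sqrt(d_k)) V. A minimal NumPy sketch of that
# computation (illustrative only; single batch, no masking):
import numpy as np

def scaled_dot_product_attention(q, k, v):
    # softmax(Q K^T / sqrt(d_k)) V, using a numerically stable softmax
    d_k = q.shape[-1]
    scores = q @ k.swapaxes(-1, -2) / np.sqrt(d_k)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ v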
def test_save_load(self):
    input_q = keras.layers.Input(shape=(5, 3), name='Input-Q')
    input_k = keras.layers.Input(shape=(4, 3), name='Input-K')
    input_v = keras.layers.Input(shape=(4, 6), name='Input-V')
    attention, weights = ScaledDotProductAttention(
        return_attention=True,
        history_only=True,
        name='Attention',
    )([input_q, input_k, input_v])
    model = keras.models.Model(inputs=[input_q, input_k, input_v],
                               outputs=[attention, weights])
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model_path = os.path.join(
        tempfile.gettempdir(),
        'keras_self_att_test_sl_%f.h5' % random.random())
    model.save(model_path)
    model = keras.models.load_model(
        model_path,
        custom_objects={
            'ScaledDotProductAttention': ScaledDotProductAttention,
        },
    )
    model.summary(line_length=120)
    self.assertIsNotNone(model)
def call(self, inputs, mask=None):
    if isinstance(inputs, list):
        q, k, v = inputs
    else:
        q = k = v = inputs
    if isinstance(mask, list):
        q_mask, k_mask, v_mask = mask
    else:
        q_mask = k_mask = v_mask = mask
    # Linear projections for queries, keys and values
    q = K.dot(q, self.Wq)
    k = K.dot(k, self.Wk)
    v = K.dot(v, self.Wv)
    if self.use_bias:
        q += self.bq
        k += self.bk
        v += self.bv
    if self.activation is not None:
        q = self.activation(q)
        k = self.activation(k)
        v = self.activation(v)
    scaled_dot_product_attention = ScaledDotProductAttention(
        history_only=self.history_only,
        name='%s-Attention' % self.name,
    )
    y = scaled_dot_product_attention(
        inputs=[
            self._reshape_to_batches(q, self.head_num),
            self._reshape_to_batches(k, self.head_num),
            self._reshape_to_batches(v, self.head_num),
        ],
        mask=[
            self._reshape_mask(q_mask, self.head_num),
            self._reshape_mask(k_mask, self.head_num),
            self._reshape_mask(v_mask, self.head_num),
        ],
    )
    # Keep per-head scores and weights around for later inspection
    self.intensity = self._reshape_attention_from_batches(
        scaled_dot_product_attention.intensity, self.head_num)
    self.attention = self._reshape_attention_from_batches(
        scaled_dot_product_attention.attention, self.head_num)
    y = self._reshape_from_batches(y, self.head_num)
    y = K.dot(y, self.Wo)
    if self.use_bias:
        y += self.bo
    if self.activation is not None:
        y = self.activation(y)
    if TF_KERAS:
        # Add shape information to tensor when using `tf.keras`
        input_shape = [K.int_shape(q), K.int_shape(k), K.int_shape(v)]
        output_shape = self.compute_output_shape(input_shape)
        if output_shape[1] is not None:
            output_shape = (-1,) + output_shape[1:]
        y = K.reshape(y, output_shape)
    return y
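# The `_reshape_to_batches` helper is not shown in this excerpt. A plausible
# sketch, assuming it follows the usual convention of folding the heads into
# the batch dimension so a single ScaledDotProductAttention call covers all
# heads at once (`K` is the Keras backend, as in the code above):
def _reshape_to_batches(x, head_num):
    # (batch, seq_len, feature) -> (batch * head_num, seq_len, feature // head_num)
    shape = K.shape(x)
    batch_size, seq_len, feature_dim = shape[0], shape[1], shape[2]
    head_dim = feature_dim // head_num
    x = K.reshape(x, (batch_size, seq_len, head_num, head_dim))
    x = K.permute_dimensions(x, [0, 2, 1, 3])
    return K.reshape(x, (batch_size * head_num, seq_len, head_dim))
# `_reshape_from_batches` would invert this mapping after the attention step.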
def call(self, inputs, mask=None):
    if isinstance(inputs, list):
        q, k, v = inputs
    else:
        q = k = v = inputs
    if isinstance(mask, list):
        q_mask, k_mask, v_mask = mask
    else:
        q_mask = k_mask = v_mask = mask
    q = K.dot(q, self.Wq)
    k = K.dot(k, self.Wk)
    v = K.dot(v, self.Wv)
    if self.use_bias:
        q += self.bq
        k += self.bk
        v += self.bv
    if self.activation is not None:
        q = self.activation(q)
        k = self.activation(k)
        v = self.activation(v)
    y, a = ScaledDotProductAttention(
        history_only=self.history_only,
        return_attention=True,
        name='%s-Attention' % self.name,
    )(
        inputs=[
            self._reshape_to_batches(q, self.head_num),
            self._reshape_to_batches(k, self.head_num),
            self._reshape_to_batches(v, self.head_num),
        ],
        mask=[
            self._reshape_mask(q_mask, self.head_num),
            self._reshape_mask(k_mask, self.head_num),
            self._reshape_mask(v_mask, self.head_num),
        ],
    )
    # Keep the attention weights for later inspection
    self.a = a
    y = self._reshape_from_batches(y, self.head_num)
    y = K.dot(y, self.Wo)
    if self.use_bias:
        y += self.bo
    if self.activation is not None:
        y = self.activation(y)
    # Restore static shape information after the batch reshuffling
    input_shape = [K.int_shape(q), K.int_shape(k), K.int_shape(v)]
    output_shape = self.compute_output_shape(input_shape)
    if output_shape[1] is not None:
        output_shape = (-1,) + output_shape[1:]
    y = K.reshape(y, output_shape)
    return y
def test_history(self):
    input_layer = keras.layers.Input(
        shape=(5,),
        name='Input',
    )
    embed_layer = keras.layers.Embedding(
        input_dim=4,
        output_dim=5,
        mask_zero=True,
        weights=[
            np.asarray([
                [0.1, 0.2, 0.3, 0.4, 0.5],
                [0.2, 0.3, 0.4, 0.6, 0.5],
                [0.4, 0.7, 0.2, 0.6, 0.9],
                [0.3, 0.5, 0.8, 0.9, 0.1],
            ]),
        ],
        name='Embedding',
    )(input_layer)
    att_layer, att_weights = ScaledDotProductAttention(
        history_only=True,
        return_attention=True,
        name='Attention',
    )([embed_layer, embed_layer, embed_layer])
    model = keras.models.Model(inputs=input_layer,
                               outputs=[att_layer, att_weights])
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()
    inputs = np.array([[1, 2, 3, 1, 0]])
    predicts = model.predict(inputs)
    results, weights = predicts[0][0], predicts[1][0]
    # With history_only=True each position can only attend backwards, so the
    # two occurrences of token 1 (positions 0 and 3) no longer match.
    self.assertFalse(np.allclose(results[0], results[3]))
    # Position 0 can only attend to itself, so its output is its own embedding.
    self.assertTrue(
        np.allclose(
            np.asarray([0.2, 0.3, 0.4, 0.6, 0.5]),
            results[0],
        ),
        results[2])
    # Attention weights above the diagonal must be zeroed by the history mask.
    for i in range(4):
        for j in range(5):
            if j > i:
                self.assertEqual(0.0, weights[i][j])
            else:
                self.assertLess(0.0, weights[i][j])
def call(self, inputs, mask=None):
    if isinstance(inputs, list):
        q, k, v = inputs
    else:
        q = k = v = inputs
    if isinstance(mask, list):
        q_mask, k_mask, v_mask = mask
    else:
        q_mask = k_mask = v_mask = mask
    feature_dim = K.shape(v)[-1]
    head_dim = feature_dim // self.head_num
    q = K.dot(q, self.Wq)
    k = K.dot(k, self.Wk)
    v = K.dot(v, self.Wv)
    if self.use_bias:
        q += self.bq
        k += self.bk
        v += self.bv
    if self.activation is not None:
        q = self.activation(q)
        k = self.activation(k)
        v = self.activation(v)
    # Run one attention layer per head on its slice of the feature dimension
    outputs = []
    for i in range(self.head_num):
        begin, end = i * head_dim, (i + 1) * head_dim
        outputs.append(
            ScaledDotProductAttention(
                history_only=self.history_only,
                name='%s-Att-%d' % (self.name, i + 1),
            )(
                inputs=[
                    q[:, :, begin:end],
                    k[:, :, begin:end],
                    v[:, :, begin:end],
                ],
                mask=[q_mask, k_mask, v_mask],
            ))
    # Concatenate the head outputs and apply the output projection
    y = K.dot(K.concatenate(outputs), self.Wo)
    if self.use_bias:
        y += self.bo
    if self.activation is not None:
        y = self.activation(y)
    return y
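# Unlike the batched variants above, this version slices the projected tensors
# per head and instantiates `head_num` separate attention layers; concatenating
# the head outputs is mathematically equivalent to the reshape-to-batches
# trick, just with more graph nodes. The slice arithmetic, with illustrative
# values:
head_num, feature_dim = 2, 8
head_dim = feature_dim // head_num
slices = [(i * head_dim, (i + 1) * head_dim) for i in range(head_num)]
assert slices == [(0, 4), (4, 8)]  # head 0 gets columns 0:4, head 1 gets 4:8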
def call(self, inputs, mask=None):
    if isinstance(inputs, list):
        q, k, v = inputs
    else:
        q = k = v = inputs
    if isinstance(mask, list):
        q_mask, k_mask, v_mask = mask
    else:
        q_mask = k_mask = v_mask = mask
    q = K.dot(q, self.Wq)
    k = K.dot(k, self.Wk)
    v = K.dot(v, self.Wv)
    if self.use_bias:
        q += self.bq
        k += self.bk
        v += self.bv
    if self.activation is not None:
        q = self.activation(q)
        k = self.activation(k)
        v = self.activation(v)
    y = ScaledDotProductAttention(
        history_only=self.history_only,
        name='%s-Attention' % self.name,
    )(
        inputs=[
            self._reshape_to_batches(q, self.head_num),
            self._reshape_to_batches(k, self.head_num),
            self._reshape_to_batches(v, self.head_num),
        ],
        mask=[
            self._reshape_mask(q_mask, self.head_num),
            self._reshape_mask(k_mask, self.head_num),
            self._reshape_mask(v_mask, self.head_num),
        ],
    )
    y = self._reshape_from_batches(y, self.head_num)
    y = K.dot(y, self.Wo)
    if self.use_bias:
        y += self.bo
    if self.activation is not None:
        y = self.activation(y)
    # NOTE: the output shape is hard-coded to (batch, 512, 768) here
    # (BERT-base-like dimensions); this restores static shape information
    # but ties the layer to that exact sequence length and feature size.
    y = K.reshape(y, (-1, 512, 768))
    return y
# Assumed imports for this builder (keras-transformer from
# https://github.com/kpot/keras-transformer plus keras-self-attention);
# `perplexity`, `lm_accuracy` and `logger` are project-specific.
from keras import backend as K
from keras.layers import Dense, Dropout, Flatten, Input, Lambda, Softmax
from keras.models import Model, load_model
from keras.regularizers import l2
from keras_self_attention import ScaledDotProductAttention, SeqSelfAttention
from keras_transformer.extras import ReusableEmbedding, TiedOutputEmbedding
from keras_transformer.position import TransformerCoordinateEmbedding
from keras_transformer.transformer import TransformerBlock


def build_model(max_length,
                loaded_model=None,
                fine_tune_model=False,
                embedding_matrix=None,
                transformer_depth=8,
                transformer_heads=8,
                l2_penalty=None,
                embedding_dropout=0.6,
                transformer_dropout=0.1,
                classifier_dropout=0.1,
                transformer_output_handling="flatten",
                print_info=False,
                train_lm=True):
    original_model = None
    if loaded_model:
        # load the specified model
        original_model = load_model(
            loaded_model,
            custom_objects={
                "perplexity": perplexity,
                "lm_accuracy": lm_accuracy,
                "SeqSelfAttention": SeqSelfAttention,
                "ScaledDotProductAttention": ScaledDotProductAttention
            })

    # regularizer for embedding layer
    l2_regularizer = l2(l2_penalty) if l2_penalty else None

    # input encoded as integers
    raw_input = Input(shape=(max_length,), name="input")

    # embedding layer, initialised with embedding matrix weights for now;
    # `embedding_matrix` may also be a (vocab_size, emb_dim) tuple when no
    # pretrained weights are supplied
    embedding_weights = [
        original_model.get_layer(name="word_embedding").get_weights()[0]
        if loaded_model else embedding_matrix
    ]
    embedding_layer = ReusableEmbedding(
        input_dim=(embedding_matrix[0] if isinstance(embedding_matrix, tuple)
                   else embedding_matrix.shape[0]),
        output_dim=(embedding_matrix[1] if isinstance(embedding_matrix, tuple)
                    else embedding_matrix.shape[1]),
        input_length=max_length,
        name="word_embedding",
        weights=(None if isinstance(embedding_matrix, tuple) and not loaded_model
                 else embedding_weights),
        embeddings_regularizer=l2_regularizer)

    # "transpose" of embedding matrix to map back to vocabulary
    if loaded_model:
        output_weights = original_model.get_layer(
            name="word_prediction_logits").get_weights()
        output_layer = TiedOutputEmbedding(
            projection_regularizer=l2_regularizer,
            projection_dropout=embedding_dropout,
            name="word_prediction_logits",
            weights=output_weights)
    else:
        output_layer = TiedOutputEmbedding(
            projection_regularizer=l2_regularizer,
            projection_dropout=embedding_dropout,
            name="word_prediction_logits")

    # transformer as taken from here:
    # https://github.com/kpot/keras-transformer/blob/master/example/models.py
    if loaded_model:
        position_weights = original_model.get_layer(
            name="position_embedding").get_weights()
        position_embedding = TransformerCoordinateEmbedding(
            max_transformer_depth=1,
            name="position_embedding",
            weights=position_weights)
    else:
        position_embedding = TransformerCoordinateEmbedding(
            max_transformer_depth=1, name="position_embedding")

    # NOTE: `embedding_matrix` is rebound here to the embedding weight tensor
    # returned by ReusableEmbedding, shadowing the parameter of the same name.
    transformer_input, embedding_matrix = embedding_layer(raw_input)
    transformer_output = position_embedding(transformer_input, step=0)

    for i in range(transformer_depth):
        block_name = "transformer" + str(i)
        # define transformer block
        transformer_block = TransformerBlock(
            name=block_name,
            num_heads=transformer_heads,
            residual_dropout=transformer_dropout,
            attention_dropout=transformer_dropout,
            use_masking=True,
            vanilla_wiring=True)
        # build the layers in the block because apparently you have to do that
        if loaded_model:
            if i == 0:
                transformer_block.attention_layer.build(
                    original_model.get_layer(
                        "position_embedding").output_shape)
            else:
                transformer_block.attention_layer.build(
                    original_model.get_layer(
                        "transformer{}_normalization2".format(
                            i - 1)).output_shape)
            transformer_block.norm1_layer.build(
                original_model.get_layer(
                    block_name + "_self_attention").output_shape)
            transformer_block.norm2_layer.build(
                original_model.get_layer(
                    block_name + "_normalization1").output_shape)
            transformer_block.transition_layer.build(
                original_model.get_layer(
                    block_name + "_normalization1").output_shape)
            # set weights for all the contained layers manually
            transformer_block.attention_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_self_attention")).get_weights())
            transformer_block.norm1_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_normalization1")).get_weights())
            transformer_block.norm2_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_normalization2")).get_weights())
            transformer_block.transition_layer.set_weights(
                original_model.get_layer(
                    name=(block_name + "_transition")).get_weights())
        # pass output of last layer through transformer
        transformer_output = transformer_block(transformer_output)

    if print_info:
        logger.debug("transformer_output shape: {}".format(
            K.int_shape(transformer_output[0]
                        if fine_tune_model else transformer_output)))

    # nothing special to load for softmax
    softmax_layer = Softmax(name="word_predictions")
    lm_output_logits = output_layer([transformer_output, embedding_matrix])
    lm_output = softmax_layer(lm_output_logits)

    if print_info:
        logger.debug("lm_output_logits shape: {}".format(
            K.int_shape(lm_output_logits)))
        logger.debug("output shape: {}".format(K.int_shape(lm_output)))

    if not fine_tune_model:
        return Model(inputs=raw_input, outputs=lm_output)

    loaded_layer_names = []
    if loaded_model:
        loaded_layer_names = [layer.name for layer in original_model.layers]

    # options for condensing the transformer output before the classifier
    flatten = Flatten(name="flatten_transformer_output")
    max_pooling = Lambda(lambda x: K.max(x, axis=1), name="max_pooling")
    mean_pooling = Lambda(lambda x: K.mean(x, axis=1), name="mean_pooling")
    self_attention = SeqSelfAttention(name="self_attention")
    scaled_dot_attention = ScaledDotProductAttention(
        name="scaled_dot_attention")
    dropout = Dropout(rate=classifier_dropout, name="classifier_dropout")
    options = {
        "flatten": flatten,
        "max_pooling": max_pooling,
        "mean_pooling": mean_pooling,
        "self_attention": self_attention,
        "scaled_dot_attention": scaled_dot_attention
    }

    dense = Dense(2, activation=None, name="dense")
    if loaded_model and "dense" in loaded_layer_names:
        layer = original_model.get_layer(name="dense")
        dense.build(layer.input_shape)
        dense.set_weights(layer.get_weights())

    pooling_layer = options[transformer_output_handling]
    if loaded_model and transformer_output_handling in loaded_layer_names:
        layer = original_model.get_layer(name=transformer_output_handling)
        pooling_layer.build(layer.input_shape)
        pooling_layer.set_weights(layer.get_weights())

    # attention-based handlers return sequences, so flatten them before
    # the classifier
    if "attention" in transformer_output_handling:
        handled_output = flatten(pooling_layer(transformer_output))
    else:
        handled_output = pooling_layer(transformer_output)

    classifier_logits = dense(dropout(handled_output))
    classifier_output = Softmax(
        name="classifier_prediction")(classifier_logits)

    if train_lm:
        return Model(inputs=raw_input,
                     outputs=[lm_output, classifier_output])
    return Model(inputs=raw_input, outputs=classifier_output)