def test_fit_self(self):
    input_layer = keras.layers.Input(
        shape=(2, 3),
        name='Input',
    )
    att_layer = MultiHeadAttention(
        head_num=3,
        name='Multi-Head-1',
    )(input_layer)
    dense_layer = keras.layers.Dense(units=3, name='Dense-1')(att_layer)
    att_layer = MultiHeadAttention(
        head_num=3,
        name='Multi-Head-2',
    )(dense_layer)
    output_layer = keras.layers.Dense(units=3, name='Dense-2')(att_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()

    def _generator(batch_size=32):
        while True:
            inputs = np.random.random((batch_size, 2, 3))
            outputs = np.asarray([[[0.0, -0.1, 0.2]] * 2] * batch_size)
            yield inputs, outputs

    model.fit_generator(
        generator=_generator(),
        steps_per_epoch=1000,
        epochs=10,
        validation_data=_generator(),
        validation_steps=100,
        callbacks=[
            keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        ],
    )
    model_path = os.path.join(tempfile.gettempdir(),
                              'test_save_load_%f.h5' % np.random.random())
    model.save(model_path)
    model = keras.models.load_model(model_path, custom_objects={
        'MultiHeadAttention': MultiHeadAttention,
    })
    for inputs, _ in _generator(batch_size=3):
        predicts = model.predict(inputs)
        expect = np.asarray([[[0.0, -0.1, 0.2]] * 2] * 3)
        actual = np.round(predicts, decimals=1)
        self.assertTrue(np.allclose(expect, actual), (expect, actual))
        break
def local_context_learning(input_length, input_dim, output_dim, hidden_dim,
                           filters_num, kernel_val, learning_rate, drop_rate):
    basic_input = Input(shape=(input_length, input_dim))
    label_input = Input(shape=(1, ))
    weighted_input = adding_weight(input_length, input_dim)([basic_input, label_input])
    rnn_output = GRU(units=hidden_dim, return_sequences=True)(weighted_input)
    rnn_att = SeqSelfAttention(attention_activation='sigmoid')(rnn_output)
    cnn_output = Conv1D(filters=filters_num,
                        kernel_size=kernel_val,
                        padding="same")(weighted_input)
    cnn_output_reformat = Dense(hidden_dim)(cnn_output)
    cnn_att = SeqSelfAttention(
        attention_activation='sigmoid')(cnn_output_reformat)
    new_value = Concatenate(axis=1)([rnn_att, cnn_att])
    new_keys = Lambda(lambda x: ones_like(x))(new_value)
    new_result = MultiHeadAttention(head_num=2)(
        [weighted_input, new_keys, new_value])
    result = Flatten()(new_result)
    result_fix = Dropout(rate=drop_rate)(result)
    output = Dense(output_dim)(result_fix)
    fixed_output = Activation(activation='sigmoid')(output)
    model = Model([basic_input, label_input], fixed_output)
    ada = adam(lr=learning_rate)
    model.compile(optimizer=ada, loss='categorical_crossentropy')
    return model
def build_model(input_shape):
    emotive_params = 266
    n_feature_maps = 64
    main_input = keras.layers.Input(input_shape)
    input_layer = keras.layers.Input((input_shape[1], input_shape[2]))
    lstm_layer = keras.layers.LSTM(emotive_params,
                                   return_sequences=True)(input_layer)
    att_layer = MultiHeadAttention(
        head_num=14,
        name='Multi-Head',
    )(lstm_layer)
    lstm_layer = keras.layers.LSTM(emotive_params,
                                   return_sequences=True)(att_layer)
    gap_layerX = keras.layers.pooling.GlobalAveragePooling1D()(lstm_layer)

    # See architecture
    cnn_model = keras.layers.TimeDistributed(
        keras.models.Model(inputs=input_layer, outputs=gap_layerX))(main_input)
    lstm_layer = keras.layers.LSTM(n_feature_maps,
                                   return_sequences=True)(cnn_model)
    gap_layerX = keras.layers.pooling.GlobalAveragePooling1D()(lstm_layer)
    output_layer = keras.layers.Dense(n_feature_maps,
                                      activation='relu')(gap_layerX)
    output_layer = keras.layers.Dense(2, activation='softmax')(output_layer)
    model = keras.models.Model(inputs=main_input, outputs=output_layer)
    model.summary()
    return model
def __init__(self, length, word_num):
    self.input_length = length
    self.batch_size = 512 * 10
    self.word_num = word_num
    self.input_dimension = 128
    self.lstm_dim = 64
    self.middle_num = 2
    self.output_dimension = 64
    # self.input_embedding = keras.layers.Embedding(input_dim=self.word_num, output_dim=self.input_dimension, weights=[weights])
    self.input_embedding = keras.layers.Embedding(
        input_dim=self.word_num, output_dim=self.input_dimension)
    # Integer division so the LSTM unit count stays an int under Python 3.
    self.bilstm = keras.layers.Bidirectional(
        keras.layers.LSTM(self.lstm_dim // 2, return_sequences=True))
    self.middle_layer = []
    for i in range(0, self.middle_num):
        self.middle_layer.append(
            MultiHeadAttention(head_num=4, name="middle_layer_{}".format(i)))
    # self.outter_layer = keras.layers.Dense(self.output_dimension, activation="tanh", input_dim=self.input_length * self.lstm_dim, use_bias=True, name="ouput_layer")
    self.outter_layer = keras.layers.Dense(self.output_dimension,
                                           input_dim=self.input_length * self.lstm_dim,
                                           use_bias=True,
                                           name="ouput_layer")
def build_model(input_shape):
    input_layer = keras.layers.Input(input_shape)
    conv1 = keras.layers.Conv1D(filters=128, kernel_size=8,
                                padding='same')(input_layer)
    conv1 = keras.layers.normalization.BatchNormalization()(conv1)
    conv1 = keras.layers.Activation(activation='relu')(conv1)
    conv2 = keras.layers.Conv1D(filters=256, kernel_size=5,
                                padding='same')(conv1)
    conv2 = keras.layers.normalization.BatchNormalization()(conv2)
    conv2 = keras.layers.Activation('relu')(conv2)
    conv3 = keras.layers.Conv1D(128, kernel_size=3, padding='same')(conv2)
    conv3 = keras.layers.normalization.BatchNormalization()(conv3)
    conv3 = keras.layers.Activation('relu')(conv3)

    # Attention
    lstm_layer = keras.layers.LSTM(64, return_sequences=True)(conv3)
    att_layer = MultiHeadAttention(head_num=8)(lstm_layer)
    lstm_layer = keras.layers.LSTM(64, return_sequences=True)(att_layer)
    gap_layer = keras.layers.pooling.GlobalAveragePooling1D()(lstm_layer)
    output_layer = keras.layers.Dense(2, activation='softmax')(gap_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
    return model
def test_compare_brute(self):
    for case in range(10):
        batch_size = np.random.randint(1, 10)
        token_num = np.random.randint(2, 10)
        head_num = np.random.randint(1, 5)
        feature_dim = np.random.randint(1, 5) * head_num
        seq_len = np.random.randint(1, 10)
        weights = []
        for i in range(8):
            if i % 2 == 0:
                weights.append(np.random.random(
                    (feature_dim, feature_dim)))
            else:
                weights.append(np.random.random((feature_dim, )))
        input_q_layer = keras.layers.Input(shape=(None, ))
        input_kv_layer = keras.layers.Input(shape=(None, ))
        embed_q_layer = keras.layers.Embedding(
            input_dim=token_num,
            output_dim=feature_dim,
            mask_zero=True,
        )(input_q_layer)
        embed_kv_layer = keras.layers.Embedding(
            input_dim=token_num,
            output_dim=feature_dim,
            mask_zero=True,
        )(input_kv_layer)
        att_layer = MultiHeadAttention(
            head_num=head_num,
            weights=weights,
            name='Multi-Head-1',
        )([embed_q_layer, embed_kv_layer, embed_kv_layer])
        att_brute_layer = MultiHeadAttentionBrute(
            head_num=head_num,
            weights=weights,
            name='Multi-Head-2',
        )([embed_q_layer, embed_kv_layer, embed_kv_layer])
        model = keras.models.Model(inputs=[input_q_layer, input_kv_layer],
                                   outputs=[att_layer, att_brute_layer])
        model.compile(optimizer='adam', loss='mse', metrics={})
        if case == 0:
            model.summary(line_length=120)
        data_q = np.random.randint(low=0, high=token_num,
                                   size=(batch_size, seq_len))
        data_kv = np.random.randint(low=0, high=token_num,
                                    size=(batch_size, seq_len))
        for i in range(batch_size):
            if np.sum(data_q[i]) == 0:
                data_q[i][np.random.randint(
                    low=0, high=seq_len)] = np.random.randint(low=1, high=token_num)
            if np.sum(data_kv[i]) == 0:
                data_kv[i][np.random.randint(
                    low=0, high=seq_len)] = np.random.randint(low=1, high=token_num)
        predicts = model.predict([data_q, data_kv])
        self.assertTrue(np.allclose(predicts[0], predicts[1]), predicts)
def getModel(voc_size, embedding_matrix):
    input_layer = Input(name='Input', shape=(MAX_WORDS_TEXT, ), dtype="float32")
    embedding_layer = Embedding(voc_size,
                                WORD_EMBEDDINGS_SIZE,
                                weights=[embedding_matrix],
                                input_length=MAX_WORDS_TEXT,
                                trainable=True)(input_layer)
    rnn_layer = Bidirectional(MultiplicativeLSTM(
        WORD_EMBEDDINGS_SIZE,
        return_sequences=True,
        dropout=0.2,
        recurrent_dropout=0.2,
        activation='pentanh',
        recurrent_activation='pentanh'),
                              merge_mode='concat')(embedding_layer)
    attention_layer = MultiHeadAttention(head_num=4)(rnn_layer)
    flatten = Flatten()(attention_layer)
    final = Dense(14, activation='sigmoid')(flatten)
    mdl = Model(inputs=input_layer, outputs=final)
    mdl.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy', auc_roc])
    print(mdl.summary())
    return mdl
def _attention_builder(x):
    return MultiHeadAttention(
        head_num=head_num,
        activation=activation,
        history_only=history_only,
        trainable=trainable,
        name=name,
    )(x)
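# Note: _attention_builder above closes over head_num, activation, history_only,
# trainable and name, so it only makes sense inside an enclosing scope that
# defines them. A minimal, hypothetical wrapper (an assumption, not taken from
# the original source) that supplies those variables could look like this:
def attention_builder_factory(head_num, activation=None, history_only=False,
                              trainable=True, name=None):
    def _attention_builder(x):
        return MultiHeadAttention(
            head_num=head_num,
            activation=activation,
            history_only=history_only,
            trainable=trainable,
            name=name,
        )(x)
    return _attention_builder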
def test_fit_zeros(self):

    def _leaky_relu(x):
        return keras.activations.relu(x, alpha=0.01)

    input_layer = keras.layers.Input(
        shape=(2, 3),
        name='Input',
    )
    norm_layer = LayerNormalization(
        name='Layer-Normalization-1',
        trainable=False,
    )(input_layer)
    att_layer = MultiHeadAttention(
        head_num=3,
        activation=_leaky_relu,
        name='Multi-Head-Attentions')(norm_layer)
    dense_layer = keras.layers.Dense(units=3, name='Dense-1')(att_layer)
    norm_layer = LayerNormalization(
        name='Layer-Normalization-2',
        trainable=False,
    )(dense_layer)
    dense_layer = keras.layers.Dense(units=3, name='Dense-2')(norm_layer)
    model = keras.models.Model(
        inputs=input_layer,
        outputs=dense_layer,
    )
    model.compile(
        optimizer=keras.optimizers.Adam(lr=1e-3),
        loss='mse',
        metrics={},
    )
    model.summary()

    def _generator_zeros(batch_size=32):
        while True:
            batch_inputs = np.zeros((batch_size, 2, 3))
            batch_outputs = np.asarray([[[0.0, -0.1, 0.2]] * 2] * batch_size)
            yield batch_inputs, batch_outputs

    model.fit_generator(
        generator=_generator_zeros(),
        steps_per_epoch=100,
        epochs=100,
        validation_data=_generator_zeros(),
        validation_steps=100,
        callbacks=[
            keras.callbacks.EarlyStopping(monitor='val_loss',
                                          patience=5,
                                          min_delta=1e-4)
        ],
    )
    for inputs, _ in _generator_zeros(batch_size=3):
        predicts = model.predict(inputs)
        expect = np.round(np.asarray([[[0.0, -0.1, 0.2]] * 2] * 3),
                          decimals=1)
        actual = np.round(predicts, decimals=1)
        self.assertTrue(np.allclose(expect, actual), (expect, actual))
        break
def character_network(lstm_layers: int, lstm_units: int, char_lstm_units: int,
                      value_first: int, value_second: int,
                      label_embedding_dim: int, max_length, max_word_length,
                      pos_tag: bool, character: bool, attention,
                      custom_layer: bool) -> Model:
    word_input = Input(shape=(max_length, 200), name='word_input')
    input_list = [word_input]
    lstm_list = [word_input]
    if pos_tag:
        pos_input = Input(shape=(max_length, 20), name='pos_input')
        input_list += [pos_input]
        lstm_list += [pos_input]
    if character:
        char_input = Input(shape=(max_length, max_word_length), name='char_input')
        char_embedding = TimeDistributed(
            Embedding(input_dim=number_of_charachters, output_dim=25))(char_input)
        char_lstm = TimeDistributed(
            Bidirectional(LSTM(char_lstm_units, return_sequences=False)))(char_embedding)
        lstm_list = input_list + [char_lstm]
        input_list += [char_input]
    if len(input_list) == 1:
        lstm_input = word_input
    else:
        lstm_input = Concatenate()(lstm_list)
    word_lstm = Bidirectional(
        LSTM(lstm_units, return_sequences=True,
             dropout=0.2, recurrent_dropout=0.2))(lstm_input)
    for i in range(lstm_layers - 1):
        word_lstm = Bidirectional(
            LSTM(lstm_units, return_sequences=True,
                 dropout=0.2, recurrent_dropout=0.2))(word_lstm)
    if attention:
        attention = MultiHeadAttention(head_num=8)(word_lstm)
        dense_first = TimeDistributed(Dense(value_first, activation=None))(attention)
    else:
        dense_first = TimeDistributed(Dense(value_first, activation=None))(word_lstm)
    crf_layer = ChainCRF(name='first_crf')
    first_output = crf_layer(dense_first)
    argmax = Lambda(lambda x: K.argmax(x))(first_output)
    label_embedding = Embedding(input_dim=value_first + 1,
                                output_dim=label_embedding_dim,
                                trainable=True)(argmax)
    # print(entity_aware_matrix.shape)
    final_input = Concatenate(axis=2)([word_lstm, label_embedding])
    # print(second_input.shape)
    if custom_layer:
        entity_aware = EntityAwareDecodingLayer()([word_lstm, argmax, label_embedding])
        entity_aware_matrix = MyRepeatVector(
            max_length, 2 * lstm_units + label_embedding_dim)(entity_aware)
        final_input = Concatenate(axis=2)([final_input, entity_aware_matrix])
        # print(final_input.shape)
    dense_second = TimeDistributed(Dense(value_second, activation=None))(final_input)
    second_crf = ChainCRF(name='second_crf')
    second_output = second_crf(dense_second)
    model = Model(inputs=input_list, outputs=[first_output, second_output])
    algorithm = Adam(lr=0.0001, decay=0, beta_1=0.9, beta_2=0.999)
    losses = {
        "first_crf": crf_layer.loss,
        "second_crf": second_crf.loss,
    }
    model.compile(loss=losses, optimizer=algorithm, metrics=['accuracy'])
    model.summary()
    return model
def test_invalid_head_num(self):
    # The feature dimension (3) is not divisible by head_num (2),
    # so building the attention layer is expected to fail.
    with self.assertRaises(IndexError):
        input_layer = keras.layers.Input(
            shape=(2, 3),
            name='Input',
        )
        MultiHeadAttention(
            head_num=2,
            name='Multi-Head',
        )(input_layer)
def build_model(emb_cid, emb_advid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    sdrop = layers.SpatialDropout1D(rate=0.2)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    content = layers.Concatenate()([emb1, emb2])

    mha = MultiHeadAttention(head_num=16)(content)
    mha = layers.Dropout(0.01)(mha)
    mha = layers.Add()([content, mha])
    mha = LayerNormalization()(mha)
    mha = layers.Dropout(0.01)(mha)
    mha_ff = FeedForward(256)(mha)
    mha_out = layers.Add()([mha, mha_ff])
    mha_out = LayerNormalization()(mha_out)

    lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(mha_out)
    avg_pool = layers.GlobalAveragePooling1D()(lstm)
    max_pool = layers.GlobalMaxPool1D()(lstm)
    x = layers.Concatenate()([avg_pool, max_pool])
    x = layers.Dense(128, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.1)(x)
    out = layers.Dense(10, activation='softmax')(x)

    model = keras.Model(inputs=[inp1, inp2], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])
    return model
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
    super(TransformerBlock, self).__init__()
    self.ffn = Sequential([
        Dense(ff_dim, activation="relu"),
        Dense(embed_dim),
    ])
    self.layernorm1 = LayerNormalization()
    self.layernorm2 = LayerNormalization()
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)
    self.att = MultiHeadAttention(head_num=num_heads, name='att_layer')
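# Hypothetical sketch (not part of the original snippet): the __init__ above only
# declares the sub-layers, so a call() along these lines would be needed to wire
# them together, assuming keras-multi-head's MultiHeadAttention is applied as
# self-attention on a single input tensor.
def call(self, inputs, training=None):
    attn_output = self.att(inputs)                        # multi-head self-attention
    attn_output = self.dropout1(attn_output, training=training)
    out1 = self.layernorm1(inputs + attn_output)          # residual + layer norm
    ffn_output = self.ffn(out1)                           # position-wise feed-forward
    ffn_output = self.dropout2(ffn_output, training=training)
    return self.layernorm2(out1 + ffn_output)             # residual + layer norm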
def test_mask_single(self):
    input_layer = keras.layers.Input(shape=(None, ))
    embed_layer = keras.layers.Embedding(input_dim=3,
                                         output_dim=4,
                                         mask_zero=True)(input_layer)
    att_layer = MultiHeadAttention(
        head_num=2,
        name='Multi-Head-2',
    )(embed_layer)
    mask_layer = GetMask()(att_layer)
    model = keras.models.Model(inputs=input_layer, outputs=mask_layer)
    model.compile(optimizer='adam', loss='mse', metrics={})
    predicts = model.predict(np.asarray([[1, 2, 1, 2, 0, 0]])).tolist()
    self.assertEqual([1.0] * 4 + [0.0] * 2, predicts[0], predicts[0])
def create_muti_head_self_attention_model(maxlen=20000, embedding=256, salayer=256):
    S_inputs = Input(shape=(maxlen, ), dtype='int32')
    embeddings = Embedding(maxlen, embedding)(S_inputs)
    O_seq = MultiHeadAttention(
        head_num=3,
        name='Multi-Head',
    )(embeddings)
    O_seq = Flatten()(O_seq)
    outputs = Dense(9, activation='softmax')(O_seq)
    model = Model(inputs=S_inputs, outputs=outputs)
    print(model.summary())
    return model
def create_model(self, params, index, logger):
    input1 = keras.Input(
        batch_shape=(params[index]["batch_size"],
                     params[index]["max_code_length"] + 2,
                     params[index]['dataset'].len_encoding),
        name='input_1')
    input2 = keras.Input(
        batch_shape=(params[index]["batch_size"],
                     params[index]["max_code_length"] + 2,
                     params[index]['dataset'].len_encoding),
        name='input_2')

    embedding = Dense(32, name='embedding1')
    dense1 = embedding(input1)
    dense2 = embedding(input2)

    lstm = Bidirectional(LSTM(512, name='lstm', return_sequences=True))
    if params[index]["attention"]:
        attention = MultiHeadAttention(head_num=2, name="multi_head_attention")

    lstm1 = lstm(dense1)
    lstm2 = lstm(dense2)

    if params[index]["attention"]:
        lstm1 = Flatten()(attention(lstm1))
        lstm2 = Flatten()(attention(lstm2))

    output_embedding = Dense(512, name="output_embedding")
    output_embedding1 = output_embedding(lstm1)
    output_embedding2 = output_embedding(lstm2)

    distance = Lambda(
        euclidean_distance,
        output_shape=eucl_dist_output_shape,
        name='distance')([output_embedding1, output_embedding2])

    return keras.Model(inputs=[input1, input2],
                       outputs=distance,
                       name=self.name + "-" + str(index))
def encoder(seq_len, m_features, d_model, n_heads, dff, rate=0.1, encoder=None):
    """Basic Attention Encoder.

    It can be concatenated with a previous encoder by passing it as argument."""
    if encoder is None:
        in_seq = keras.layers.Input(shape=(seq_len, m_features))
        norm_0 = LayerNormalization()(in_seq)
    else:
        # When chaining onto a previous encoder, feed its output straight in.
        in_seq = encoder.output
        norm_0 = in_seq
    linear = keras.layers.Dense(units=d_model)(norm_0)
    pos = TrigPosEmbedding(mode=TrigPosEmbedding.MODE_ADD)(linear)
    mha = MultiHeadAttention(head_num=n_heads)(pos)
    mha_drop = keras.layers.Dropout(rate=rate)(mha)
    add_1 = keras.layers.Add()([pos, mha_drop])
    norm_1 = LayerNormalization()(add_1)
    ff = FeedForward(dff)(norm_1)
    ff_drop = keras.layers.Dropout(rate=rate)(ff)
    add_2 = keras.layers.Add()([ff_drop, norm_1])
    out = LayerNormalization()(add_2)
    return keras.Model(in_seq, out) if encoder is None else keras.Model(encoder.input, out)
def build_model(input_shape):
    main_input = keras.layers.Input(input_shape)
    input_layer = keras.layers.Input((input_shape[1], input_shape[2]))
    conv1 = keras.layers.Conv1D(filters=128, kernel_size=8,
                                padding='same')(input_layer)
    conv1 = keras.layers.normalization.BatchNormalization()(conv1)
    conv1 = keras.layers.Activation(activation='relu')(conv1)
    conv2 = keras.layers.Conv1D(filters=256, kernel_size=5,
                                padding='same')(conv1)
    conv2 = keras.layers.normalization.BatchNormalization()(conv2)
    conv2 = keras.layers.Activation('relu')(conv2)
    conv3 = keras.layers.Conv1D(128, kernel_size=3, padding='same')(conv2)
    conv3 = keras.layers.normalization.BatchNormalization()(conv3)
    conv3 = keras.layers.Activation('relu')(conv3)
    gap_layer = keras.layers.pooling.GlobalAveragePooling1D()(conv3)
    cnn_model = keras.layers.TimeDistributed(
        keras.models.Model(inputs=input_layer, outputs=gap_layer))(main_input)
    lstm_layer = keras.layers.LSTM(64, return_sequences=True)(cnn_model)
    print(lstm_layer)
    att_layer = MultiHeadAttention(head_num=8)(lstm_layer)
    print(att_layer)
    lstm_layer = keras.layers.LSTM(64, return_sequences=True)(att_layer)
    print(lstm_layer)
    gap_layerX = keras.layers.pooling.GlobalAveragePooling1D()(lstm_layer)
    output_layer = keras.layers.Dense(64, activation='relu')(gap_layerX)
    output_layer = keras.layers.Dense(2, activation='softmax')(output_layer)
    model = keras.models.Model(inputs=main_input, outputs=output_layer)
    model.summary()
    return model
def loadBertModel():
    keras.utils.generic_utils.get_custom_objects().update({'pentanh': Pentanh()})
    input = Input((500, 3072))
    rnn_layer = Bidirectional(MultiplicativeLSTM(256,
                                                 return_sequences=True,
                                                 dropout=0.2,
                                                 recurrent_dropout=0.2,
                                                 activation='pentanh',
                                                 recurrent_activation='pentanh'),
                              merge_mode='concat')(input)
    attention_layer = MultiHeadAttention(head_num=4)(rnn_layer)
    removeMask = Flatten()(attention_layer)
    final = Dense(14, activation='sigmoid')(removeMask)
    model_complete = Model(inputs=input, outputs=final)
    model_complete.compile(optimizer='adam',
                           loss='binary_crossentropy',
                           metrics=['accuracy', auc_roc])
    print(model_complete.summary())
    return model_complete
def local_context_learning(input_length, input_dim, output_dim, hidden_dim,
                           filters_num, kernel_val, learning_rate, drop_rate):
    basic_input = Input(shape=(input_length, input_dim))
    label_input = Input(shape=(1, ))
    weighted_input = adding_weight()(basic_input)

    def true_process():
        return weighted_input

    def false_process():
        return basic_input

    # tf.cond expects callables for its branches, so pass the functions themselves.
    actual_input = Lambda(lambda x: tf.cond(x > tf.constant(value=0.5),
                                            true_fn=true_process,
                                            false_fn=false_process))(label_input)
    rnn_output = GRU(hidden_dim, return_sequences=True)(actual_input)
    rnn_att = SeqSelfAttention(attention_activation='sigmoid')(rnn_output)
    cnn_output = Conv1D(filters=filters_num,
                        kernel_size=kernel_val,
                        padding="same")(actual_input)
    cnn_output_reformat = Dense(hidden_dim)(cnn_output)
    cnn_att = SeqSelfAttention(
        attention_activation='sigmoid')(cnn_output_reformat)
    fixed_rnn_output = adding_attention(rnn_output, rnn_att, input_length, hidden_dim)
    fixed_cnn_output = adding_attention(cnn_output_reformat, cnn_att,
                                        input_length, hidden_dim)
    new_value = Concatenate(axis=1)([fixed_rnn_output, fixed_cnn_output])
    # Wrap the backend op in a Lambda so the result stays a Keras tensor.
    new_keys = Lambda(lambda x: ones_like(x))(new_value)
    new_result = MultiHeadAttention(head_num=2)(
        [actual_input, new_keys, new_value])
    result = Flatten()(new_result)
    result_fix = Dropout(rate=drop_rate)(result)
    output = Dense(output_dim)(result_fix)
    fixed_output = Activation(activation='sigmoid')(output)
    model = Model([basic_input, label_input], fixed_output)
    ada = adam(lr=learning_rate)
    model.compile(optimizer=ada, loss='categorical_crossentropy')
    return model
def block(attention_input, head_num: int, feed_forward_units: int,
          dropout_rate: float) -> Tensor:
    attention_x = MultiHeadAttention(
        head_num=head_num,
        activation=None,
        use_bias=False,
        history_only=True,
        trainable=True,
    )(attention_input)
    attention_x = Dropout(dropout_rate)(attention_x)
    attention_x = Add()([attention_input, attention_x])
    feed_forward_input = LayerNormalization(trainable=True)(attention_x)
    feed_forward_x = FeedForward(units=feed_forward_units,
                                 activation='relu',
                                 trainable=True)(feed_forward_input)
    feed_forward_x = Dropout(dropout_rate)(feed_forward_x)
    feed_forward_x = Add()([feed_forward_input, feed_forward_x])
    block_output = LayerNormalization(trainable=True)(feed_forward_x)
    return block_output
def build_model(input_shape):
    emotive_params = 266
    n_feature_maps = 64
    input_layer = keras.layers.Input(input_shape)
    lstm_layer = keras.layers.LSTM(emotive_params,
                                   return_sequences=True)(input_layer)
    att_layer = MultiHeadAttention(
        head_num=14,
        name='Multi-Head',
    )(lstm_layer)
    lstm_layer = keras.layers.LSTM(n_feature_maps,
                                   return_sequences=True)(att_layer)
    gap_layerX = keras.layers.pooling.GlobalAveragePooling1D()(lstm_layer)
    output_layer = keras.layers.Dense(n_feature_maps,
                                      activation='relu')(gap_layerX)
    output_layer = keras.layers.Dense(2, activation='softmax')(output_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.summary()
    return model
def test_sample(self):
    input_layer = keras.layers.Input(
        shape=(512, ),
        name='Input',
    )
    embed_layer = keras.layers.Embedding(
        input_dim=12,
        output_dim=768,
        mask_zero=True,
        name='Embedding',
    )(input_layer)
    output_layer = MultiHeadAttention(
        head_num=12,
        name='Multi-Head',
    )(embed_layer)
    model = keras.models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )
    model.summary()
def build_albert(token_num,
                 pos_num=512,
                 seq_len=512,
                 embed_dim=128,
                 hidden_dim=768,
                 transformer_num=12,
                 head_num=12,
                 feed_forward_dim=3072,
                 dropout_rate=0.1,
                 attention_activation=None,
                 feed_forward_activation='gelu',
                 training=True,
                 trainable=None,
                 output_layers=None):
    """Get ALBERT model.

    See: https://arxiv.org/pdf/1909.11942.pdf

    :param token_num: Number of tokens.
    :param pos_num: Maximum position.
    :param seq_len: Maximum length of the input sequence or None.
    :param embed_dim: Dimensions of embeddings.
    :param hidden_dim: Dimensions of hidden layers.
    :param transformer_num: Number of transformers.
    :param head_num: Number of heads in multi-head attention in each transformer.
    :param feed_forward_dim: Dimension of the feed forward layer in each transformer.
    :param dropout_rate: Dropout rate.
    :param attention_activation: Activation for attention layers.
    :param feed_forward_activation: Activation for feed-forward layers.
    :param training: A built model with MLM and NSP outputs will be returned if it is `True`,
                     otherwise the input layers and the last feature extraction layer will be returned.
    :param trainable: Whether the model is trainable.
    :param output_layers: A list of indices of output layers.
    """
    if attention_activation == 'gelu':
        attention_activation = gelu
    if feed_forward_activation == 'gelu':
        feed_forward_activation = gelu
    if trainable is None:
        trainable = training

    def _trainable(_layer):
        if isinstance(trainable, (list, tuple, set)):
            for prefix in trainable:
                if _layer.name.startswith(prefix):
                    return True
            return False
        return trainable

    # Build inputs
    input_token = keras.layers.Input(shape=(seq_len, ), name='Input-Token')
    input_segment = keras.layers.Input(shape=(seq_len, ), name='Input-Segment')
    inputs = [input_token, input_segment]

    # Build embeddings
    embed_token, embed_weights, embed_projection = AdaptiveEmbedding(
        input_dim=token_num,
        output_dim=hidden_dim,
        embed_dim=embed_dim,
        mask_zero=True,
        trainable=trainable,
        return_embeddings=True,
        return_projections=True,
        name='Embed-Token',
    )(input_token)
    embed_segment = keras.layers.Embedding(
        input_dim=2,
        output_dim=hidden_dim,
        trainable=trainable,
        name='Embed-Segment',
    )(input_segment)
    embed_layer = keras.layers.Add(name='Embed-Token-Segment')(
        [embed_token, embed_segment])
    embed_layer = PositionEmbedding(
        input_dim=pos_num,
        output_dim=hidden_dim,
        mode=PositionEmbedding.MODE_ADD,
        trainable=trainable,
        name='Embedding-Position',
    )(embed_layer)
    if dropout_rate > 0.0:
        dropout_layer = keras.layers.Dropout(
            rate=dropout_rate,
            name='Embedding-Dropout',
        )(embed_layer)
    else:
        dropout_layer = embed_layer
    embed_layer = LayerNormalization(
        trainable=trainable,
        name='Embedding-Norm',
    )(dropout_layer)

    # Build shared transformer
    attention_layer = MultiHeadAttention(
        head_num=head_num,
        activation=attention_activation,
        name='Attention',
    )
    attention_normal = LayerNormalization(name='Attention-Normal')
    feed_forward_layer = FeedForward(units=feed_forward_dim,
                                     activation=feed_forward_activation,
                                     name='Feed-Forward')
    feed_forward_normal = LayerNormalization(name='Feed-Forward-Normal')

    transformed = embed_layer
    transformed_layers = []
    for i in range(transformer_num):
        attention_input = transformed
        transformed = attention_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Attention-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Attention-Add-{}'.format(i + 1),
        )([attention_input, transformed])
        transformed = attention_normal(transformed)
        feed_forward_input = transformed
        transformed = feed_forward_layer(transformed)
        if dropout_rate > 0.0:
            transformed = keras.layers.Dropout(
                rate=dropout_rate,
                name='Feed-Forward-Dropout-{}'.format(i + 1),
            )(transformed)
        transformed = keras.layers.Add(
            name='Feed-Forward-Add-{}'.format(i + 1),
        )([feed_forward_input, transformed])
        transformed = feed_forward_normal(transformed)
        transformed_layers.append(transformed)

    if training:
        # Build tasks
        mlm_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation=feed_forward_activation,
            name='MLM-Dense',
        )(transformed)
        mlm_norm_layer = LayerNormalization(name='MLM-Norm')(mlm_dense_layer)
        mlm_pred_layer = AdaptiveSoftmax(
            input_dim=hidden_dim,
            output_dim=token_num,
            embed_dim=embed_dim,
            bind_embeddings=True,
            bind_projections=True,
            name='MLM-Sim',
        )([mlm_norm_layer, embed_weights, embed_projection])
        masked_layer = Masked(name='MLM')([mlm_pred_layer, inputs[-1]])
        extract_layer = Extract(index=0, name='Extract')(transformed)
        nsp_dense_layer = keras.layers.Dense(
            units=hidden_dim,
            activation='tanh',
            name='SOP-Dense',
        )(extract_layer)
        nsp_pred_layer = keras.layers.Dense(
            units=2,
            activation='softmax',
            name='SOP',
        )(nsp_dense_layer)
        model = keras.models.Model(inputs=inputs,
                                   outputs=[masked_layer, nsp_pred_layer])
        for layer in model.layers:
            layer.trainable = _trainable(layer)
        return model
    if output_layers is not None:
        if isinstance(output_layers, list):
            output_layers = [
                transformed_layers[index] for index in output_layers
            ]
            output = keras.layers.Concatenate(name='Output', )(output_layers)
        else:
            output = transformed_layers[output_layers]
        model = keras.models.Model(inputs=inputs, outputs=output)
        return model
    model = keras.models.Model(inputs=inputs, outputs=transformed)
    for layer in model.layers:
        layer.trainable = _trainable(layer)
    return inputs, transformed
def build_model(emb_cid, emb_advid, emb_aid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp3 = layers.Input(shape=(max_len, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    emb3 = layers.Embedding(input_dim=emb_aid.shape[0],
                            output_dim=emb_aid.shape[1],
                            input_length=max_len,
                            weights=[emb_aid],
                            trainable=False)(inp3)
    sdrop = layers.SpatialDropout1D(rate=0.2)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    emb3 = sdrop(emb3)

    id_c = emb1
    id_adv_ad = layers.Concatenate()([emb2, emb3])

    mha1 = MultiHeadAttention(head_num=16)(id_adv_ad)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([id_adv_ad, mha1])
    mha1 = LayerNormalization()(mha1)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(128)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)
    id_adv_ad_lstm = layers.Bidirectional(
        layers.LSTM(200, return_sequences=True))(mha1_out)
    id_adv_ad_max_pool = layers.GlobalMaxPool1D()(id_adv_ad_lstm)

    mha2 = MultiHeadAttention(head_num=16)(id_c)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2 = layers.Add()([id_c, mha2])
    mha2 = LayerNormalization()(mha2)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2_ff = FeedForward(128)(mha2)
    mha2_out = layers.Add()([mha2, mha2_ff])
    mha2_out = LayerNormalization()(mha2_out)
    id_c_lstm = layers.Bidirectional(layers.LSTM(
        200, return_sequences=True))(mha2_out)
    id_c_max_pool = layers.GlobalMaxPool1D()(id_c_lstm)

    x = layers.Add()([id_c_max_pool, id_adv_ad_max_pool])
    x = layers.Dense(256, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.BatchNormalization()(x)
    x = layers.Dropout(0.15)(x)
    out = layers.Dense(10, activation='softmax')(x)

    model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(1e-3),
                  metrics=['accuracy'])
    return model
def test_fit(self):
    input_layer = keras.layers.Input(
        shape=(1, 3),
        name='Input',
    )
    att_layer = MultiHeadAttention(
        head_num=3,
        activation=self._leaky_relu,
        name='Multi-Head-Attention-1')(input_layer)
    normal_layer = LayerNormalization(
        name='Layer-Normalization-1',
    )(att_layer)
    feed_forward_layer = FeedForward(
        units=12,
        activation=self._leaky_relu,
        name='FeedForward',
    )(normal_layer)
    normal_layer = LayerNormalization(
        name='Layer-Normalization-2',
    )(feed_forward_layer)
    output_layer = keras.layers.Add(name='Add')(
        [input_layer, normal_layer])
    model = keras.models.Model(
        inputs=input_layer,
        outputs=output_layer,
    )
    model.compile(
        optimizer='adam',
        loss='mse',
        metrics={},
    )

    def _generator(batch_size=32):
        while True:
            batch_inputs = np.random.random((batch_size, 1, 3))
            batch_outputs = batch_inputs + 0.2
            yield batch_inputs, batch_outputs

    model.fit_generator(
        generator=_generator(),
        steps_per_epoch=1000,
        epochs=10,
        validation_data=_generator(),
        validation_steps=100,
        callbacks=[
            keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)
        ],
    )
    model_path = os.path.join(
        tempfile.gettempdir(),
        'keras_feed_forward_%f.h5' % np.random.random())
    model.save(model_path)
    model = keras.models.load_model(
        model_path,
        custom_objects={
            '_leaky_relu': self._leaky_relu,
            'MultiHeadAttention': MultiHeadAttention,
            'LayerNormalization': LayerNormalization,
            'FeedForward': FeedForward,
        },
    )
    for inputs, _ in _generator(batch_size=3):
        predicts = model.predict(inputs)
        expect = inputs + 0.2
        for i in range(3):
            for j in range(3):
                self.assertTrue(
                    np.abs(expect[i, 0, j] - predicts[i, 0, j]) < 0.1,
                    (expect, predicts))
        break
train_x = wv.Zero_padding(train_x, Maxseq_length)
test_x = wv.Zero_padding(test_x, Maxseq_length)
# train_y = to_categorical(train_y)
# test_y = to_categorical(test_y)

model = Sequential()
model.add(Conv1D(32, 3, padding='valid', activation='custom_gelu', strides=1))
model.add(MaxPooling1D(pool_size=2))
model.add(MultiHeadAttention(
    head_num=32,
    name='Multi-Head-Attention',
))
model.add(layers.LSTM(32, return_sequences=True))
model.add(layers.Flatten())
model.add(Dropout(0.5))
model.add(layers.Dense(1, activation='sigmoid'))
model.build()
model.summary()
model.compile(optimizer=Adam(lr=0.001),
              loss=keras.losses.binary_crossentropy,
              metrics=['accuracy'])

model.fit(train_x, train_y, batch_size=64, epochs=60)
scores = model.evaluate(test_x, test_y, verbose=2)
print("Accuracy: %.2f%%" % (scores[1] * 100))
"""
text = layers.Conv2D(64, (3, 300), padding='valid', strides=1)(news_input)
text1 = layers.MaxPooling2D(pool_size=(1, 1))(text)
def build_model(input_shape):
    n_feature_maps = 64
    main_input = keras.layers.Input(input_shape)
    input_layer = keras.layers.Input((input_shape[1], input_shape[2]))
    # input_layer = PreProcessingLayer(266)(input_layer)
    # print(input_layer)

    # BLOCK 1
    conv_x = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=8,
                                 padding='same')(input_layer)
    conv_x = keras.layers.normalization.BatchNormalization()(conv_x)
    conv_x = keras.layers.Activation('relu')(conv_x)
    conv_y = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=5,
                                 padding='same')(conv_x)
    conv_y = keras.layers.normalization.BatchNormalization()(conv_y)
    conv_y = keras.layers.Activation('relu')(conv_y)
    conv_z = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=3,
                                 padding='same')(conv_y)
    conv_z = keras.layers.normalization.BatchNormalization()(conv_z)
    # expand channels for the sum
    shortcut_y = keras.layers.Conv1D(filters=n_feature_maps, kernel_size=1,
                                     padding='same')(input_layer)
    shortcut_y = keras.layers.normalization.BatchNormalization()(shortcut_y)
    output_block_1 = keras.layers.add([shortcut_y, conv_z])
    output_block_1 = keras.layers.Activation('relu')(output_block_1)

    # BLOCK 2
    conv_x = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=8,
                                 padding='same')(output_block_1)
    conv_x = keras.layers.normalization.BatchNormalization()(conv_x)
    conv_x = keras.layers.Activation('relu')(conv_x)
    conv_y = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=5,
                                 padding='same')(conv_x)
    conv_y = keras.layers.normalization.BatchNormalization()(conv_y)
    conv_y = keras.layers.Activation('relu')(conv_y)
    conv_z = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=3,
                                 padding='same')(conv_y)
    conv_z = keras.layers.normalization.BatchNormalization()(conv_z)
    # expand channels for the sum
    shortcut_y = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=1,
                                     padding='same')(output_block_1)
    shortcut_y = keras.layers.normalization.BatchNormalization()(shortcut_y)
    output_block_2 = keras.layers.add([shortcut_y, conv_z])
    output_block_2 = keras.layers.Activation('relu')(output_block_2)

    # BLOCK 3
    conv_x = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=8,
                                 padding='same')(output_block_2)
    conv_x = keras.layers.normalization.BatchNormalization()(conv_x)
    conv_x = keras.layers.Activation('relu')(conv_x)
    conv_y = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=5,
                                 padding='same')(conv_x)
    conv_y = keras.layers.normalization.BatchNormalization()(conv_y)
    conv_y = keras.layers.Activation('relu')(conv_y)
    conv_z = keras.layers.Conv1D(filters=n_feature_maps * 2, kernel_size=3,
                                 padding='same')(conv_y)
    conv_z = keras.layers.normalization.BatchNormalization()(conv_z)
    # no need to expand channels because they are equal
    shortcut_y = keras.layers.normalization.BatchNormalization()(
        output_block_2)
    output_block_3 = keras.layers.add([shortcut_y, conv_z])
    output_block_3 = keras.layers.Activation('relu')(output_block_3)
    print(output_block_3)

    # FINAL
    gap_layer = keras.layers.GlobalAveragePooling1D()(output_block_3)
    print(gap_layer)
    cnn_model = keras.layers.TimeDistributed(
        keras.models.Model(inputs=input_layer, outputs=gap_layer))(main_input)
    print(cnn_model)
    pos_enc = PreProcessingLayer()(cnn_model)
    print(pos_enc)
    # lstm_layer = keras.layers.Bidirectional(keras.layers.LSTM(n_feature_maps, return_sequences=True))(cnn_model)
    lstm_layer = keras.layers.LSTM(64, return_sequences=True)(pos_enc)
    att_layer = MultiHeadAttention(head_num=16)(lstm_layer)
    lstm_layer = keras.layers.LSTM(64, return_sequences=True)(att_layer)
    gap_layerX = keras.layers.pooling.GlobalAveragePooling1D()(lstm_layer)
    output_layer = keras.layers.Dense(64, activation='relu')(gap_layerX)
    output_layer = keras.layers.Dense(2, activation='softmax')(output_layer)
    model = keras.models.Model(inputs=main_input, outputs=output_layer)
    model.summary()
    return model
def get_model(self, params, a=False, b=False, c=False, d=False, e=False,
              f=False, g=False, dropout=0.5):
    hash_input = layers.Input(shape=(params['max_words'], ), dtype='int32')
    x = layers.Embedding(params['hash_mole'],
                         params['embed_size'],
                         input_length=params['max_words'],
                         name=self.embedding_name)(hash_input)
    x = layers.Dropout(dropout / 3)(x)
    if a:
        # did not train
        # needs positional embedding?
        x = MultiHeadAttention(4)(x)
        x = layers.Dropout(dropout / 3)(x)
    if b:
        x = layers.Bidirectional(
            self.get_lstm(params['units'] // 2, return_sequences=True))(x)
        x = layers.Dropout(dropout)(x)
        x = layers.TimeDistributed(MultiHeadAttention(4))(x)
        # x = layers.Flatten()(x)
        # x = layers.Dropout(dropout)(x)
        # x = layers.Dense(params['embed_size'])(x)
        x = layers.Dropout(dropout / 3)(x)
    # if c:
    x = layers.Bidirectional(
        self.get_lstm(params['units'] // 2,
                      return_sequences=False,
                      name=self.bidirectional_name))(x)
    x = layers.Dropout(dropout)(x)
    x = layers.RepeatVector(params['num_sylls'])(x)
    x = layers.Dropout(dropout)(x)
    if d:
        x = PositionEmbedding(input_dim=params['embed_size'],
                              output_dim=params['num_sylls'] * 4,
                              mode=PositionEmbedding.MODE_CONCAT)(x)
        x = layers.Dropout(dropout)(x)
        x = MultiHeadAttention(4)(x)
        x = layers.Dropout(dropout)(x)
    x = self.get_lstm(params['units'],
                      return_sequences=True,
                      name=self.cu_dnnlstm_name)(x)
    if e:
        x = PositionEmbedding(input_dim=params['units'],
                              output_dim=params['units'],
                              mode=PositionEmbedding.MODE_ADD)(x)
        x = layers.Dropout(dropout)(x)
        # x = layers.Dense(params['units'])(x)
        # x = layers.Dropout(dropout)(x)
    if f:
        # this was somewhat effective
        x = MultiHeadAttention(2)(x)
        x = layers.Dropout(dropout)(x)
    if g:
        x = layers.Dense(params['units'],
                         kernel_initializer='identity',
                         name='dense_identity',
                         activation='relu')(x)
        x = layers.Dropout(dropout)(x)
    output_layer = layers.Dense(params['max_features'],
                                activation='softmax',
                                name=self.dense_name)(x)
    model = Model(inputs=[hash_input], outputs=[output_layer])
    return model
def build_model(emb_cid, emb_advid, emb_aid):
    inp1 = layers.Input(shape=(max_len, ))
    inp2 = layers.Input(shape=(max_len, ))
    inp3 = layers.Input(shape=(max_len, ))
    emb1 = layers.Embedding(input_dim=emb_cid.shape[0],
                            output_dim=emb_cid.shape[1],
                            input_length=max_len,
                            weights=[emb_cid],
                            trainable=False)(inp1)
    emb2 = layers.Embedding(input_dim=emb_advid.shape[0],
                            output_dim=emb_advid.shape[1],
                            input_length=max_len,
                            weights=[emb_advid],
                            trainable=False)(inp2)
    emb3 = layers.Embedding(input_dim=emb_aid.shape[0],
                            output_dim=emb_aid.shape[1],
                            input_length=max_len,
                            weights=[emb_aid],
                            trainable=False)(inp3)
    sdrop = layers.SpatialDropout1D(rate=0.2)
    emb1 = sdrop(emb1)
    emb2 = sdrop(emb2)
    emb3 = sdrop(emb3)
    content = layers.Concatenate()([emb1, emb2, emb3])

    mha1 = MultiHeadAttention(head_num=32)(content)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1 = layers.Add()([content, mha1])
    mha1 = LayerNormalization()(mha1)
    mha1 = layers.Dropout(0.01)(mha1)
    mha1_ff = FeedForward(256)(mha1)
    mha1_out = layers.Add()([mha1, mha1_ff])
    mha1_out = LayerNormalization()(mha1_out)

    mha2 = MultiHeadAttention(head_num=32)(mha1_out)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2 = layers.Add()([mha1_out, mha2])
    mha2 = LayerNormalization()(mha2)
    mha2 = layers.Dropout(0.01)(mha2)
    mha2_ff = FeedForward(256)(mha2)
    mha2_out = layers.Add()([mha2, mha2_ff])
    mha2_out = LayerNormalization()(mha2_out)

    mha3 = MultiHeadAttention(head_num=32)(mha2_out)
    mha3 = layers.Dropout(0.01)(mha3)
    mha3 = layers.Add()([mha2_out, mha3])
    mha3 = LayerNormalization()(mha3)
    mha3 = layers.Dropout(0.01)(mha3)
    mha3_ff = FeedForward(256)(mha3)
    mha3_out = layers.Add()([mha3, mha3_ff])
    mha3_out = LayerNormalization()(mha3_out)

    avg_pool = layers.GlobalAveragePooling1D()(mha3_out)
    max_pool = layers.GlobalMaxPool1D()(mha3_out)
    x = layers.Concatenate()([avg_pool, max_pool])
    x = layers.Dense(256)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(64)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation('relu')(x)
    x = layers.Dropout(0.2)(x)
    out = layers.Dense(1, activation='sigmoid')(x)

    model = keras.Model(inputs=[inp1, inp2, inp3], outputs=out)
    model.compile(loss='binary_crossentropy',
                  optimizer=keras.optimizers.Adam(2e-4),
                  metrics=['accuracy'])
    return model