def __call__(self, hidden_states): """ SelfAttention is originally proposed by Cheng et al., 2016 https://arxiv.org/pdf/1601.06733.pdf Here using the implementation of Philipperemy from https://github.com/philipperemy/keras-attention-mechanism/blob/master/attention/attention.py with modification that `attn_units` and `attn_activation` attributes can be changed. The default values of these attributes are same as used by the auther. However, there is another implementation of SelfAttention at https://github.com/CyberZHG/keras-self-attention/blob/master/keras_self_attention/seq_self_attention.py but the author have cited a different paper i.e. Zheng et al., 2018 https://arxiv.org/pdf/1806.01264.pdf and named it as additive attention. A useful discussion about this (in this class) implementation can be found at https://github.com/philipperemy/keras-attention-mechanism/issues/14 Many-to-one attention mechanism for Keras. @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim). @return: 2D tensor with shape (batch_size, 128) @author: felixhao28. The original code which has here been modified had Apache Licence 2.0. """ hidden_size = int(hidden_states.shape[2]) # Inside dense layer # hidden_states dot W => score_first_part # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size) # W is the trainable weight matrix of attention Luong's multiplicative style score score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec' + self.context)(hidden_states) # score_first_part dot last_hidden_state => attention_weights # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps) h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state' + self.context)(hidden_states) score = dot([score_first_part, h_t], [2, 1], name='attention_score' + self.context) attention_weights = Activation('softmax', name='attention_weight' + self.context)(score) # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size) context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector' + self.context) pre_activation = concatenate([context_vector, h_t], name='attention_output' + self.context) attention_vector = Dense(self.attn_units, use_bias=False, activation=self.attn_activation, name='attention_vector' + self.context)(pre_activation) return attention_vector
def finetuning_siamese_cnn(mymodel_tmp, num_frame, num_neg_singers, num_pos_tracks):
    anchor = Input(shape=(num_frame, config.n_mels))
    pos_items = [Input(shape=(num_frame, config.n_mels)) for i in range(num_pos_tracks)]
    neg_items = [Input(shape=(num_frame, config.n_mels)) for i in range(num_neg_singers)]

    dense = Dense(256)
    ap = GlobalAvgPool1D()

    anchor_out = mymodel_tmp(anchor)
    pos_outs = [mymodel_tmp(pos_item) for pos_item in pos_items]
    neg_outs = [mymodel_tmp(neg_item) for neg_item in neg_items]

    ### cosine
    pos_dists = [dot([anchor_out, pos_out], axes=1, normalize=True) for pos_out in pos_outs]
    neg_dists = [dot([anchor_out, neg_out], axes=1, normalize=True) for neg_out in neg_outs]
    all_dists = concatenate(pos_dists + neg_dists)
    outputs = Activation('linear')(all_dists)

    model = Model(inputs=[anchor] + pos_items + neg_items, outputs=outputs)
    return model
def getModelInstance(parameters):
    encoderInput = Input(shape=(None, parameters["enc_vocab_size"],))
    encoder = Bidirectional(LSTM(128, return_sequences=True, return_state=True), merge_mode='concat')
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder(encoderInput)
    encoderH = concatenate([forward_h, backward_h])
    encoderC = concatenate([forward_c, backward_c])

    decoderInput = Input(shape=(None, parameters["dec_vocab_size"],))
    decoderLstm = LSTM(256, return_sequences=True)
    decoderOutput = decoderLstm(decoderInput, initial_state=[encoderH, encoderC])

    attention = dot([decoderOutput, encoder_outputs], axes=(2, 2))
    attention = Activation('softmax', name='attention')(attention)
    context = dot([attention, encoder_outputs], axes=(2, 1))

    decoderCombined = concatenate([context, decoderOutput])
    output = TimeDistributed(Dense(128, activation="relu"))(decoderCombined)
    output = TimeDistributed(Dense(parameters["dec_vocab_size"], activation="softmax"))(output)

    model = Model([encoderInput, decoderInput], [output])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
def __call__(self, hidden_states):
    # Note: what is actually passed in here is `outputs`, not raw hidden_states.
    # Since this is not time-series data, no decoder is defined.
    """
    Many-to-one attention mechanism for Keras.
    @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
    @return: 2D tensor with shape (batch_size, 128)
    @author: felixhao28.
    """
    hidden_size = int(hidden_states.shape[2])
    # 1) Compute the attention scores.
    # Inside dense layer
    # hidden_states dot W => score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
    # i.e. the 2-D matrix multiplication is applied batch_size times.
    # W is the trainable weight matrix of attention, Luong's multiplicative style score
    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
    # score_first_part dot last_hidden_state => attention_weights
    # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
    # Take only the last hidden state: x[:, -1, :]
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    # matrix product over axes [2, 1]: time_steps, hidden_size -> hidden_size
    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    # 2) Obtain the attention distribution via the softmax function.
    attention_weights = Activation('softmax', name='attention_weight')(score)
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
    # 3) Compute the attention value as the weighted sum of the encoder hidden states and the attention weights.
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    # 4) Concatenate the attention value with the decoder hidden state at time step t.
    pre_activation = concatenate([context_vector, h_t], name='attention_output')
    # 5) Compute s~t, which becomes the input of the output-layer computation.
    attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
    return attention_vector
def ls(yt, yp):
    f = dot([self.y2, self.x2], axes=-1, normalize=True)
    fg = dot([self.g2, self.x2], axes=-1, normalize=True)
    r = maximum(0.0, 0.3 + subtract([fg, f]))
    r = sum(r, axis=-1)
    return mean(r)  # batch
def attention_3d_block(self, hidden_states):
    """Attention mechanism.

    Reference - https://github.com/philipperemy/keras-attention-mechanism

    Args:
      - hidden_states: RNN hidden states (3d array)

    Return:
      - attention_vector: output states after attention mechanism.
    """
    # hidden_states.shape = (batch_size, time_steps, hidden_size)
    hidden_size = int(hidden_states.shape[2])
    # Inside dense layer
    # hidden_states dot W => score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
    # W is the trainable weight matrix of attention, Luong's multiplicative style score
    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
    # score_first_part dot last_hidden_state => attention_weights
    # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    attention_weights = Activation('softmax', name='attention_weight')(score)
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    pre_activation = concatenate([context_vector, h_t], name='attention_output')
    attention_vector = Dense(self.h_dim, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
    return attention_vector
def attention_3d_block(hidden_states, dense_activation='tanh'):
    """
    Many-to-one attention mechanism for Keras.
    @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim).
    @return: 2D tensor with shape (batch_size, 128)
    @author: felixhao28.
    """
    hidden_size = int(hidden_states.shape[2])
    # Inside dense layer
    # hidden_states dot W => score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
    # W is the trainable weight matrix of attention, Luong's multiplicative style score
    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
    # score_first_part dot last_hidden_state => attention_weights
    # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    attention_weights = Activation('softmax', name='attention_weight')(score)
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    pre_activation = concatenate([context_vector, h_t], name='attention_output')
    attention_vector = Dense(128, use_bias=False, activation=dense_activation, name='attention_vector')(pre_activation)
    return attention_vector
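# A minimal usage sketch (assumed, not part of the original snippet): attention_3d_block
# on top of an LSTM that returns the full sequence. It assumes the Keras layers used
# inside the block (Dense, Lambda, dot, Activation, concatenate) are already imported.
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

inputs = Input(shape=(20, 8))                       # (time_steps, input_dim)
rnn_out = LSTM(32, return_sequences=True)(inputs)   # (batch, 20, 32)
attention = attention_3d_block(rnn_out)             # (batch, 128)
outputs = Dense(1, activation='sigmoid')(attention)
model = Model(inputs, outputs)
model.compile(optimizer='adam', loss='binary_crossentropy')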
def read(self, keys, scale=None):
    """Read from memory.

    Read the memory for the given keys. For each key in keys we will get one
    result as `r = sum_i M[i] a[i]`, where `M[i]` is the memory content at
    location i and `a[i]` is the attention weight for the key at location i.
    `a` is calculated as the softmax of a scaled similarity between the key
    and each memory content: `a[i] = exp(scale*sim[i]) / (sum_j exp(scale*sim[j]))`

    Args:
        keys (Tensor): shape[-1] is dim. For a single key read, the shape is
            (batch_size, dim). For a multiple key read, the shape is
            (batch_size, k, dim), where k is the number of keys.
        scale (None|float|Tensor): shape is () or keys.shape[:-1]. The cosine
            similarities are multiplied with `scale` before softmax is applied.
            If None, use the scale provided at the constructor.
    Returns:
        result Tensor: shape is the same as keys. result[..., i] is the read
            result for the corresponding key.
    """
    if not self._built:
        self.build(keys.shape[0])
    assert 2 <= len(keys.shape) <= 3
    assert keys.shape[0] == self._batch_size
    assert keys.shape[-1] == self.dim
    if scale is None:
        scale = self._scale
    else:
        if isinstance(scale, (int, float)):
            pass
        else:  # assuming it's a Tensor
            scale = expand_dims_as(scale, keys)
    sim = layers.dot([keys, self._memory], axes=-1, normalize=self._normalize)
    sim = sim * scale
    attention = activations.softmax(sim)
    result = layers.dot([attention, self._memory], axes=(-1, 1))

    if len(sim.shape) > 2:  # multiple read keys
        usage = tf.reduce_sum(attention, axis=tf.range(1, len(sim.shape) - 1))
    else:
        usage = attention

    if self._snapshot_only:
        self._usage.assign_add(usage)
    else:
        self._usage = self._usage + usage

    return result
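# A small shape sketch (assumed, not from the original class) of the two dot products
# that implement the read: similarity of the keys against the memory, then a weighted
# sum of memory rows by the attention weights. Shapes and tensor names are illustrative.
import tensorflow as tf

batch_size, k, mem_size, dim = 2, 3, 5, 4
keys = tf.random.normal([batch_size, k, dim])          # multiple-key read
memory = tf.random.normal([batch_size, mem_size, dim])

sim = tf.keras.layers.dot([keys, memory], axes=-1)     # (batch_size, k, mem_size)
attention = tf.nn.softmax(sim)                         # weights over memory locations
result = tf.keras.layers.dot([attention, memory], axes=(-1, 1))  # (batch_size, k, dim)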
def attention_block(hidden_states):
    print(hidden_states.shape)
    hidden_size = int(hidden_states.shape[2])
    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    attention_weights = Activation('softmax', name='attention_weight')(score)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    pre_activation = concatenate([context_vector, h_t], name='attention_output')
    attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
    return attention_vector
def attention_3d_block(hidden_states): """ Many-to-one attention mechanism for Keras. @param hidden_states: 3D tensor with shape (batch_size, time_steps, input_dim). @return: 2D tensor with shape (batch_size, 128) @author: felixhao28. """ if False: hidden_size = int(hidden_states.shape[2]) # Inside dense layer # hidden_states dot W => score_first_part # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size) # W is the trainable weight matrix of attention Luong's multiplicative style score score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states) # score_first_part dot last_hidden_state => attention_weights # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps) h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size, ), name='last_hidden_state')(hidden_states) score = dot([score_first_part, h_t], [2, 1], name='attention_score') attention_weights = Activation('softmax', name='attention_weight')(score) # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size) context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector') pre_activation = concatenate([context_vector, h_t], name='attention_output') attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(pre_activation) return attention_vector """ Many-to-one attention mechanism for Keras. (modified version) @author: ysmoon """ hidden_size = int(hidden_states.shape[2]) query = Dense(hidden_size, use_bias=False, name="query")(hidden_states) key = Dense(hidden_size, use_bias=False, name="key")(hidden_states) score = dot([query, key], [2, 2]) # [batch, seq, seq] attention_weights = Activation('softmax', name='attention_weight')(score) value = Dense(hidden_size, use_bias=False, name="value")(hidden_states) context_vector = dot([attention_weights, value], [2, 1]) context = tf.keras.backend.max(context_vector, axis=2) attention_vector = Dense(128, use_bias=False, activation='tanh', name='attention_vector')(context) return attention_vector
def get_model(self):
    # encoder_inputs shape == (batch_size, encoder_seq_length)
    self.encoder_inputs = Input(shape=(None,))
    # encoder_emb shape == (batch_size, encoder_seq_length, embedding_dim)
    encoder_emb = Embedding(self.num_encoder_tokens + 1, self.embedding_dim,
                            mask_zero=True)(self.encoder_inputs)
    # encoder shape == (batch_size, encoder_seq_length, num_encoder_units)
    self.encoder_outputs = Bidirectional(
        LSTM(self.num_encoder_units, return_sequences=True, unroll=False))(encoder_emb)
    self.encoder_outputs = Dense(self.num_decoder_units)(self.encoder_outputs)
    # encoder_last shape == (batch_size, num_decoder_units)
    self.encoder_last = self.encoder_outputs[:, -1, :]
    self.encoder_last.set_shape([None, self.num_decoder_units])

    # decoder_inputs shape == (batch_size, decoder_seq_length)
    self.decoder_inputs = Input(shape=(None,))
    # decoder_emb shape == (batch_size, decoder_seq_length, embedding_dim)
    decoder_emb = Embedding(self.num_decoder_tokens + 1, self.embedding_dim,
                            mask_zero=True)(self.decoder_inputs)
    # decoder_outputs shape == (batch_size, decoder_seq_length, num_decoder_units)
    decoder_outputs = LSTM(self.num_decoder_units, return_sequences=True, unroll=False)(
        decoder_emb, initial_state=[self.encoder_last, self.encoder_last])

    # attention shape == (batch_size, decoder_seq_length, max_encoder_seq_length)
    attention = dot([decoder_outputs, self.encoder_outputs], axes=[2, 2])
    attention = Activation("softmax", name="attention")(attention)
    # context shape == (batch_size, decoder_seq_length, latent_dim)
    context = dot([attention, self.encoder_outputs], axes=[2, 1])
    # decoder_combined_context shape == (batch_size, decoder_seq_length, latent_dim)
    decoder_combined_context = concatenate([context, decoder_outputs])

    # decoder_outputs shape == (batch_size, decoder_seq_length)
    decoder_outputs = TimeDistributed(
        Dense(self.num_decoder_units, activation="tanh"))(decoder_combined_context)
    # decoder_outputs shape == (batch_size, decoder_seq_length, num_decoder_tokens)
    decoder_outputs = TimeDistributed(
        Dense(self.num_decoder_tokens, activation="softmax"))(decoder_outputs)

    return Model([self.encoder_inputs, self.decoder_inputs], decoder_outputs)
def dssm(index2vec, max_reviews=5, dim=32, J=4):
    # user input: indices of the items in the user's history
    user_input = Input(shape=(max_reviews,), name='user_input')
    # the item the user clicked (positive sample)
    pos_input = Input(shape=(1,), name='pos_input')
    # items the user did not click (negative samples)
    neg_inputs = [Input(shape=(1,)) for _ in range(J)]

    # embeddings of the items in the user's viewing history
    user_embedding = Embedding(len(index2vec), dim, weights=[index2vec],
                               input_length=max_reviews, trainable=False)(user_input)
    # average of all viewed item embeddings
    user_average = GlobalAveragePooling1D()(user_embedding)
    user_fc = Dense(32, activation='relu', name='ufc')(user_average)

    pos_embedding = Embedding(len(index2vec), dim, weights=[index2vec], trainable=False)(pos_input)
    neg_embeddings = [Embedding(len(index2vec), dim, weights=[index2vec], trainable=False)(neg_input)
                      for neg_input in neg_inputs]
    pos_flatten = Flatten()(pos_embedding)
    neg_flattens = [Flatten()(neg_embedding) for neg_embedding in neg_embeddings]

    item_fc = Dense(32, activation='relu', name='ifc')
    pos_fc = item_fc(pos_flatten)
    neg_fcs = [item_fc(neg_flatten) for neg_flatten in neg_flattens]

    # cosine similarity between the user vector and each item vector
    user_product_pos = dot([user_fc, pos_fc], axes=1, normalize=True)
    user_product_negs = [dot([user_fc, neg_fc], axes=1, normalize=True) for neg_fc in neg_fcs]

    concat = concatenate([user_product_pos] + user_product_negs)
    ctr = Activation("softmax")(concat)

    model = Model(inputs=[user_input, pos_input] + neg_inputs, outputs=ctr)
    model.compile(optimizer="adam", loss='categorical_crossentropy', metrics=['acc'])
    return model
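# A minimal training sketch (assumed, not from the original code): index2vec is a
# pretrained (vocab_size, dim) embedding matrix, and the labels put all probability
# mass on the positive item, which always occupies the first slot of the softmax.
import numpy as np

vocab_size, dim, J, max_reviews, n = 1000, 32, 4, 5, 64
index2vec = np.random.rand(vocab_size, dim).astype('float32')
model = dssm(index2vec, max_reviews=max_reviews, dim=dim, J=J)

user_hist = np.random.randint(0, vocab_size, size=(n, max_reviews))
pos_item = np.random.randint(0, vocab_size, size=(n, 1))
neg_items = [np.random.randint(0, vocab_size, size=(n, 1)) for _ in range(J)]
labels = np.zeros((n, 1 + J), dtype='float32')
labels[:, 0] = 1.0  # positive item is the first output column

model.fit([user_hist, pos_item] + neg_items, labels, epochs=1, batch_size=16)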
def get_qpair_model():
    embedding_size = 128

    inp1 = layers.Input(shape=(100,))
    inp2 = layers.Input(shape=(100,))

    x1 = layers.Embedding(6000, embedding_size)(inp1)
    x2 = layers.Embedding(6000, embedding_size)(inp2)

    x3 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(x1)
    x4 = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(x2)

    x5 = layers.GlobalMaxPool1D()(x3)
    x6 = layers.GlobalMaxPool1D()(x4)

    x7 = layers.dot([x5, x6], axes=1)

    x8 = layers.Dense(40, activation='relu')(x7)
    x9 = layers.Dropout(0.05)(x8)
    x10 = layers.Dense(10, activation='relu')(x9)
    output = layers.Dense(2, activation="softmax")(x10)

    model = models.Model(inputs=[inp1, inp2], outputs=output)
    model.compile(loss='CategoricalCrossentropy', optimizer='adam', metrics=['accuracy'])
    # batch_size = 100
    # epochs = 3
    return model
def build(self, vector_dim=5, learn_rate=0.1):
    self.embedding_size = vector_dim
    if os.path.exists(self.trained_weights_path):
        self.model = load_model(self.trained_weights_path)
    else:
        stddev = 1.0 / vector_dim
        initializer = tf.random_normal_initializer(mean=0.0, stddev=stddev, seed=None)

        business_input = Input(shape=(1,), name="business_input")
        business_embedding = Embedding(input_dim=self.business_size,
                                       output_dim=vector_dim,
                                       input_length=1,
                                       name="input_embedding",
                                       embeddings_initializer=initializer)(business_input)

        target_input = Input(shape=(1,), name="business_target")
        target_embedding = Embedding(input_dim=self.business_size,
                                     output_dim=vector_dim,
                                     input_length=1,
                                     name="target_embedding",
                                     embeddings_initializer=initializer)(target_input)

        merged = dot([business_embedding, target_embedding], axes=2, normalize=False, name="dot")
        merged = Flatten()(merged)
        output = Dense(1, activation='sigmoid', name="output")(merged)

        model = Model(inputs=[business_input, target_input], outputs=output)
        model.compile(loss="binary_crossentropy", optimizer=Adam(learn_rate), metrics=['accuracy'])
        self.model = model

    logging.info(self.model.summary())
def monotonic_alignment(args):
    h_enc, h_dec, T_x, T_y, Y, hidden_dim = args
    struc_zeros = K.expand_dims(K.cast(np.triu(np.ones([T_x, T_x])), dtype='float32'), 0)
    alignment_probs = K.softmax(
        dot([Dense(hidden_dim)(h_enc), h_dec], axes=-1, normalize=False), -2)
    h_enc_rep = K.tile(K.expand_dims(h_enc, -2), [1, 1, T_y, 1])
    h_dec_rep = K.tile(K.expand_dims(h_dec, -3), [1, T_x, 1, 1])
    h_rep = K.concatenate([h_enc_rep, h_dec_rep], -1)

    alignment_probs_ = []
    for i in range(T_y):
        if i == 0:
            align_prev_curr = tf.gather(alignment_probs, i, axis=-1)
        if i > 0:
            align_prev_curr = tf.einsum('nx,ny->nxy',
                                        tf.gather(alignment_probs, i, axis=-1),
                                        alignment_probs_[i - 1])
            align_prev_curr *= struc_zeros
            align_prev_curr = K.sum(align_prev_curr, 1) + 1e-6
            align_prev_curr /= K.sum(align_prev_curr, -1, keepdims=True)
        alignment_probs_.append(align_prev_curr)
    alignment_probs_ = K.stack(alignment_probs_, -1)

    emission_probs = Dense(hidden_dim * 3, activation='tanh')(h_rep)
    emission_probs = Dense(Y, activation='softmax')(emission_probs)
    # alphas = tf.expand_dims(alignment_probs_, -1) * emission_probs
    # return tf.reduce_sum(alphas, -3)
    return (alignment_probs_, emission_probs)
def build_model(embedding_layer, embedding_layer_entity, max_len):
    sequence_input = Input(shape=(max_len,))
    entity_input = Input(shape=(2,),)
    embedded_sequences = embedding_layer(sequence_input)
    embedded_entity = embedding_layer_entity(entity_input)
    # print(entity_input.shape)
    x = Conv1D(128, 3, activation='relu', padding='same')(embedded_sequences)
    x1 = Conv1D(128, 2, activation='relu')(embedded_entity)

    ### aspect based attention block
    con = Concatenate(axis=1)([x, x1])
    x2 = Dense(1, activation='tanh')(con)
    x2 = Flatten()(x2)
    x2 = Activation('softmax')(x2)
    x2 = RepeatVector(64)(x2)
    x2 = dot([x, x2], axes=1)
    x2 = Permute([2, 1])(x2)
    ### attention end

    x = MaxPooling1D(3)(x2)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    x = Conv1D(128, 3, activation='relu')(x)
    x = MaxPooling1D(3)(x)
    # global max pooling
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    # x = concatenate([x, d])
    preds = Dense(1, activation='sigmoid')(x)

    model = Model([sequence_input, entity_input], preds)
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['acc', f1_m, precision_m, recall_m])
    return model
def make_model(self):
    with k.name_scope("SVM_features"):
        svm_features = Input(shape=(self.svm_dims,), name="svm_features")
        svm_input = Dense(128, activation="tanh", name="svm_dense")(svm_features)
        # svm_input = LeakyReLU()(svm_input)

    with k.name_scope("LSTM_features"):
        lstm_features = Input(shape=(None, self.input_shape), name="lstm_features")
        lstm_mask = Masking(mask_value=Config.MASKING_VALUE,
                            input_shape=(self.time_steps, self.input_shape))(lstm_features)
        lstm_output, state_h, state_c = LSTM(Config.LSTM_UNITS,
                                             return_sequences=True,
                                             return_state=True,
                                             name="lstm_sequence")(lstm_mask)
        # lstm_output_last = LSTM(Config.LSTM_UNITS, return_sequences=False, name="lstm_last_output")(lstm_mask)

    with k.name_scope("AttentionLayer_1"):
        __, lstm_output_ex_last = Lambda(lambda t: [t, t[:, :-1, :]], name="lstm_T1_Tn-1")(lstm_output)
        lstm_output_last = state_h
        attention_weights1 = dot([lstm_output_last, lstm_output_ex_last],
                                 name="attention_weights1", axes=-1)  # [B, 1, M]
        attention_weights2 = Activation("softmax", name="attention_weights2")(attention_weights1)
        lstm_attention = dot([attention_weights2, lstm_output_ex_last],
                             name="lstm_attention", axes=1)
        # final_attention = concatenate([lstm_attention, lstm_output_last])
        print(lstm_attention)

    """
    with k.name_scope("AttentionLayer_2"):
        # Attention layer 2 - attention params
        input_attention = Input(shape=(Config.ATTENTION_UNITS,), name="attention_params")
        u = Dense(Config.ATTENTION_UNITS, activation="softmax", name="attention_u")(input_attention)
        alpha = dot([u, lstm_output], axes=-1)
        alpha = Activation("softmax", name="attention_weights")(alpha)
        # weighted pool
        lstm_attention = dot([alpha, lstm_output], name="attention_output", axes=1)
    """

    with k.name_scope("Concatenate"):
        x = concatenate([lstm_attention, svm_input])
        x_dense = Dense(128, activation="tanh")(x)
        # x_dense = LeakyReLU()(x_dense)
        dense_2 = Dense(128, activation="tanh")(x_dense)
        batchnorm2 = BatchNormalization()(dense_2)
        dropout = Dropout(rate=0.3, name="dropout")(batchnorm2)
        pred = Dense(self.num_classes, activation="softmax", name="output")(dropout)

    self.model = Model(inputs=[svm_features, lstm_features], outputs=[pred])
    return self.model
def attention_3d_block(hidden_states):
    # @author: felixhao28.
    # hidden_states.shape = (batch_size, time_steps, hidden_size)
    hidden_size = int(hidden_states.shape[2])
    # Inside dense layer
    # hidden_states dot W => score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
    # W is the trainable weight matrix of attention, Luong's multiplicative style score
    score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
    # score_first_part dot last_hidden_state => attention_weights
    # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
    score = dot([score_first_part, h_t], [2, 1], name='attention_score')
    attention_weights = Activation('softmax', name='attention_weight')(score)
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
    context_vector = dot([hidden_states, attention_weights], [1, 1], name='context_vector')
    pre_activation = concatenate([context_vector, h_t], name='attention_output')
    attention_vector = Dense(256, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
    return attention_vector
def attn(hidden_states, name='Attention_layer'):
    hidden_size = int(hidden_states.shape[2])
    # Inside dense layer
    # hidden_states dot W => score_first_part
    # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
    # W is the trainable weight matrix of attention, Luong's multiplicative style score
    score_first_part = Dense(hidden_size, use_bias=False)(hidden_states)
    # score_first_part dot last_hidden_state => attention_weights
    # (batch_size, time_steps, hidden_size) dot (batch_size, hidden_size) => (batch_size, time_steps)
    h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,))(hidden_states)
    score = dot([score_first_part, h_t], [2, 1])
    attention_weights = Activation('softmax')(score)
    # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
    context_vector = dot([hidden_states, attention_weights], [1, 1])
    pre_activation = concatenate([context_vector, h_t])
    attention_vector = Dense(128, use_bias=False, activation='tanh')(pre_activation)
    return attention_vector
def get_siamese_model(input_shape):
    """
    Model architecture
    """
    # Define the tensors for the two input images
    left_input = Input(input_shape)
    right_input = Input(input_shape)

    # Convolutional Neural Network
    model = Sequential()
    model.add(Conv2D(64, (10, 10), activation='relu', input_shape=input_shape,
                     kernel_initializer="random_uniform", kernel_regularizer=l2(2e-4)))
    model.add(MaxPooling2D())
    model.add(Conv2D(128, (7, 7), activation='relu',
                     kernel_initializer="random_uniform", bias_initializer="zeros",
                     kernel_regularizer=l2(2e-4)))
    model.add(MaxPooling2D())
    model.add(Conv2D(128, (4, 4), activation='relu',
                     kernel_initializer="random_uniform", bias_initializer="zeros",
                     kernel_regularizer=l2(2e-4)))
    model.add(MaxPooling2D())
    model.add(Conv2D(256, (4, 4), activation='relu',
                     kernel_initializer="random_uniform", bias_initializer="zeros",
                     kernel_regularizer=l2(2e-4)))
    model.add(Flatten())
    model.add(Dense(4096, activation='sigmoid', kernel_regularizer=l2(1e-3),
                    kernel_initializer="random_uniform", bias_initializer="zeros"))

    # Generate the encodings (feature vectors) for the two images
    encoded_l = model(left_input)
    encoded_r = model(right_input)
    similarity = dot([encoded_l, encoded_r], axes=-1, normalize=True)

    # Connect the inputs with the outputs
    siamese_net = Model(inputs=[left_input, right_input], outputs=similarity)

    # return the model
    return siamese_net
def create_model(self):
    output_len = self.max_seq_length

    inputt = Input(shape=(self.max_seq_length), dtype='int32')
    emb = self.one_hot_layer()
    emb.trainable = True
    embedded = emb(inputt)

    conv2 = Conv1D(
        self.latent_dim,
        kernel_size=2,
        activation='tanh',
        padding='same'
        # ,dilation_rate=2
    )(embedded)
    lstm_input = concatenate([embedded, conv2])

    encoder_output = Bidirectional(
        LSTM(self.latent_dim, return_sequences=True),
        input_shape=(output_len, self.token_count),
    )(lstm_input)

    # Due to `return_sequences` the encoder outputs are of shape
    # (X, sequence_length, 2 x LSTM hidden dim).
    # We only need the last timestep for our decoder input.
    encoder_last = encoder_output[:, -1, :]
    repeated = RepeatVector(output_len)(encoder_last)
    decoder_output = Bidirectional(
        LSTM(self.latent_dim, return_sequences=True))(repeated)

    # custom attention
    attention = dot([decoder_output, encoder_output], axes=[2, 2])
    attention = Activation('softmax', name='attention')(attention)
    context = dot([attention, encoder_output], axes=[2, 1])
    decoder_combined_context = concatenate([context, decoder_output])

    td_dense = TimeDistributed(Dense(self.latent_dim, activation='tanh'))
    output_1 = td_dense(decoder_combined_context)
    output = self.output_layer()(output_1)

    self.model = Model(inputs=inputt, outputs=output)
    self.compile_model()
def self_attention(x):
    '''
    .  stands for dot product
    *  stands for elemwise multiplication

    m = x . transpose(x)
    n = softmax(m)
    o = n . x
    a = o * x

    return a
    '''
    m = dot([x, x], axes=[2, 2])
    n = Activation('softmax')(m)
    o = dot([n, x], axes=[2, 1])
    a = multiply([o, x])

    return a
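# A minimal usage sketch (assumed, not part of the original snippet): applying
# self_attention to a sequence of LSTM states. It assumes `dot`, `Activation` and
# `multiply` are imported from tensorflow.keras.layers, as the function expects.
from tensorflow.keras.layers import Input, LSTM, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Model

inputs = Input(shape=(50, 16))                  # (time_steps, features)
states = LSTM(64, return_sequences=True)(inputs)
attended = self_attention(states)               # (batch, 50, 64), reweighted states
pooled = GlobalAveragePooling1D()(attended)
outputs = Dense(1, activation='sigmoid')(pooled)
model = Model(inputs, outputs)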
def skeleton_cnn(num_frame, weights):
    x_input = Input(shape=(num_frame, 128))

    # audio model
    conv1 = Conv1D(128, kernel_size=3, padding='same', use_bias=True,
                   kernel_regularizer=l2(1e-5), kernel_initializer='he_normal')
    bn1 = BatchNormalization()
    activ1 = LeakyReLU(0.2)
    # activ1 = Activation('relu')
    mp1 = MaxPool1D(pool_size=3)

    conv2 = Conv1D(128, kernel_size=3, padding='same', use_bias=True,
                   kernel_regularizer=l2(1e-5), kernel_initializer='he_normal')
    bn2 = BatchNormalization()
    activ2 = LeakyReLU(0.2)
    # activ2 = Activation('relu')
    mp2 = MaxPool1D(pool_size=3)

    conv3 = Conv1D(128, kernel_size=3, padding='same', use_bias=True,
                   kernel_regularizer=l2(1e-5), kernel_initializer='he_normal')
    bn3 = BatchNormalization()
    activ3 = LeakyReLU(0.2)
    # activ3 = Activation('relu')
    mp3 = MaxPool1D(pool_size=3)
    do3 = Dropout(0.5)

    conv4 = Conv1D(128, kernel_size=3, padding='same', use_bias=True,
                   kernel_regularizer=l2(1e-5), kernel_initializer='he_normal')
    bn4 = BatchNormalization()
    activ4 = LeakyReLU(0.2)
    # activ4 = Activation('relu')
    mp4 = MaxPool1D(pool_size=3)

    conv5 = Conv1D(256, kernel_size=1, padding='same', use_bias=True,
                   kernel_regularizer=l2(1e-5), kernel_initializer='he_normal')
    bn5 = BatchNormalization()
    activ5 = LeakyReLU(0.2)
    # activ5 = Activation('relu')
    do5 = Dropout(0.5)
    ap = GlobalAvgPool1D()

    # Anchor
    out = mp1(activ1(bn1(conv1(x_input))))
    out = mp2(activ2(bn2(conv2(out))))
    out = mp3(activ3(bn3(conv3(out))))
    out = do3(out)
    out = mp4(activ4(bn4(conv4(out))))
    out = activ5(bn5(conv5(out)))
    out = do5(out)
    out = ap(out)
    # out = Dense(num_artist, activation='softmax')(out)

    out = dot([out, out], axes=1, normalize=True)
    out = Activation('linear')(out)

    model = Model(inputs=x_input, outputs=out)
    model.load_weights(weights)
    return model
def __init__(self, dictionarySize=2500, sentenceLength=30):
    # settings
    self.dictionarySize = dictionarySize
    self.sentenceLength = sentenceLength

    # keras overall model
    embedding = Embedding(dictionarySize, 128, mask_zero=True, input_length=None)
    encoder = LSTM(256, return_sequences=True, return_state=True)
    decoder = LSTM(256, return_sequences=True, return_state=True)
    classifierLayer1 = TimeDistributed(Dense(256, activation='tanh'))
    classifierLayer2 = TimeDistributed(Dense(dictionarySize, activation='softmax'))

    questions = Input(shape=(None,), dtype='int32')
    answers = Input(shape=(None,), dtype='int32')
    embeddedQuestions = embedding(questions)
    embeddedAnswers = embedding(answers)

    encoded, h, c = encoder(embeddedQuestions)
    decoded, _, _ = decoder(embeddedAnswers, initial_state=[h, c])
    attention = Activation('softmax')(dot([encoded, decoded], axes=[2, 2]))
    context = dot([attention, encoded], axes=[2, 1])
    features = concatenate([decoded, context])
    distributions = classifierLayer2(classifierLayer1(features))

    self.kerasOverallModel = Model([questions, answers], distributions)
    self.kerasOverallModel.compile(optimizer='rmsprop',
                                   loss='categorical_crossentropy',
                                   sample_weight_mode='temporal')

    # keras model interfaces
    self.kerasEncoderModel = Model(questions, [encoded, h, c])

    encoded = Input(shape=(None, 256))
    hMemCells = Input(shape=(256,))
    cMemCells = Input(shape=(256,))
    decoded, h, c = decoder(embeddedAnswers, initial_state=[hMemCells, cMemCells])
    attention = Activation('softmax')(dot([encoded, decoded], axes=[2, 2]))
    context = dot([attention, encoded], axes=[2, 1])
    features = concatenate([decoded, context])
    distributions = classifierLayer2(classifierLayer1(features))
    self.kerasDecoderModel = Model([answers, encoded, hMemCells, cMemCells],
                                   [distributions, h, c])
def cross_modal_attention(x, y):
    '''
    .  stands for dot product
    *  stands for elemwise multiplication
    {} stands for concatenation

    m1 = x . transpose(y)  ||  m2 = y . transpose(x)
    n1 = softmax(m1)       ||  n2 = softmax(m2)
    o1 = n1 . y            ||  o2 = n2 . x
    a1 = o1 * x            ||  a2 = o2 * y

    return {a1, a2}
    '''
    # Only the a1 branch is computed here.
    m1 = dot([x, y], axes=[2, 2])
    n1 = Activation('softmax')(m1)
    o1 = dot([n1, y], axes=[2, 1])
    a1 = multiply([o1, x])

    return a1
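# A minimal usage sketch (assumed, not part of the original snippet): attending one
# modality over another. The two sequences must produce the same feature dimension
# for the final elementwise multiplication to be valid.
from tensorflow.keras.layers import Input, LSTM
from tensorflow.keras.models import Model

audio_in = Input(shape=(40, 20))
text_in = Input(shape=(40, 20))
audio_seq = LSTM(64, return_sequences=True)(audio_in)
text_seq = LSTM(64, return_sequences=True)(text_in)

# audio attended by text: (batch, 40, 64)
audio_att = cross_modal_attention(audio_seq, text_seq)
model = Model([audio_in, text_in], audio_att)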
def _get_model2(self):
    x1 = Input(shape=(10,))
    x2 = Input(shape=(10,))
    y = dot([Dense(10)(x1), Dense(10)(x2)], axes=1)
    model = Model(inputs=[x1, x2], outputs=y)
    model.compile(loss="mse", optimizer="adam")
    wrapped = OracleWrapper(model, BiasedReweightingPolicy(), score="loss")
    x = [np.random.rand(16, 10), np.random.rand(16, 10)]
    y = np.random.rand(16, 1)

    return model, wrapped, x, y
def finetuning_mono2mix(vocal_model, mix_model, num_frame, num_neg_artist, num_pos_track):
    anchor = Input(shape=(num_frame, config.n_mels))
    pos_items = [Input(shape=(num_frame, config.n_mels)) for i in range(num_pos_track)]
    neg_items = [Input(shape=(num_frame, config.n_mels)) for i in range(num_neg_artist)]

    anchor_out = vocal_model(anchor)
    # anchor_out = mix_model(anchor)
    pos_outs = [mix_model(pos_item) for pos_item in pos_items]
    neg_outs = [mix_model(neg_item) for neg_item in neg_items]

    ### cosine
    pos_dists = [dot([anchor_out, pos_out], axes=1, normalize=True) for pos_out in pos_outs]
    neg_dists = [dot([anchor_out, neg_out], axes=1, normalize=True) for neg_out in neg_outs]
    all_dists = concatenate(pos_dists + neg_dists)
    outputs = Activation('linear', name='siamese')(all_dists)

    '''
    # euc distance
    norm = Lambda(lambda x: K.l2_normalize(x, axis=1), name='l2_norm')
    anchor_out = norm(anchor_out)
    pos_outs = [norm(pos_out) for pos_out in pos_outs]
    neg_outs = [norm(neg_out) for neg_out in neg_outs]

    distance = Lambda(euclidean_dist, output_shape=euclidean_dist_output_shape, name='euclidean')
    pos_dists = [distance([anchor_out, pos_out]) for pos_out in pos_outs]
    neg_dists = [distance([anchor_out, neg_out]) for neg_out in neg_outs]
    outputs = concatenate(pos_dists + neg_dists)
    '''
    '''
    distance = Lambda(euclidean_dist, output_shape=euclidean_dist_output_shape, name='euclidean')
    pos_dist = distance([anchor_out, pos_outs[0]])
    model = Model(inputs=[anchor] + pos_items + neg_items, outputs=[outputs, pos_dist])
    '''

    model = Model(inputs=[anchor] + pos_items + neg_items, outputs=outputs)
    return model
def make_seq2seq_models(self, x_train, y_train):
    input = Input(shape=(1, x_train.shape[1]))
    output = Input(shape=(1, y_train.shape[1]))
    n_hidden = 50

    encoder_stack_h, encoder_last_h, encoder_last_c = LSTM(
        n_hidden, activation='elu', dropout=0.2,
        return_sequences=True, return_state=True)(input)
    encoder_last_h = BatchNormalization(momentum=0.1)(encoder_last_h)
    encoder_last_c = BatchNormalization(momentum=0.1)(encoder_last_c)

    decoder = RepeatVector(self.len_pred)(encoder_last_h)
    decoder_stack_h, decoder_last_h, decoder_last_c = LSTM(
        n_hidden, activation='elu', dropout=0.2,
        return_state=True, return_sequences=True)(
            decoder, initial_state=[encoder_last_h, encoder_last_c])

    attention = dot([decoder_stack_h, encoder_stack_h], axes=[2, 2])
    attention = Activation('softmax')(attention)
    context = dot([attention, encoder_stack_h], axes=[2, 1])
    context = BatchNormalization(momentum=0.6)(context)

    decoder_combined_context = concatenate([context, decoder_stack_h])
    out = TimeDistributed(Dense(1))(decoder_combined_context)

    model = Model(inputs=input, outputs=out)
    opt = Adam(lr=0.001)
    model.compile(loss='mae', optimizer=opt, metrics=['mse'])
    print(model.summary())
    return model
def Build_Attention_layer(Parametre_layer, encoder, decoder):
    if Parametre_layer["type_attention"] == "Luong":
        # Luong's attention, built by hand
        attention = L.dot([decoder[0], encoder], axes=[2, 2])
        attention = L.Activation('softmax')(attention)
        context = L.dot([attention, encoder], axes=[2, 1])
        decoder_combined_context = K.concatenate([context, decoder[0]])
    elif Parametre_layer["type_attention"] == "Luong_keras":
        # Luong's attention via the built-in keras Attention layer
        context_vector = L.Attention(
            use_scale=Parametre_layer["use_scale"],
            causal=Parametre_layer["use_self_attention"],
            dropout=Parametre_layer["dropout"])([decoder[0], encoder])
        decoder_combined_context = K.concatenate([context_vector, decoder[0]])
    elif Parametre_layer["type_attention"] == "Bah_keras":
        # use the keras AdditiveAttention layer (Bahdanau-style attention)
        context_vector = L.AdditiveAttention(
            use_scale=Parametre_layer["use_scale"],
            causal=Parametre_layer["use_self_attention"],
            dropout=Parametre_layer["dropout"])([decoder[0], encoder])
        decoder_combined_context = K.concatenate([context_vector, decoder[0]])
    return decoder_combined_context
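# A minimal usage sketch (assumed, not from the original code): Parametre_layer keys are
# the ones read inside the function; `decoder` is passed as the tuple returned by an LSTM
# with return_state=True, so decoder[0] is the decoder sequence output. The `causal`
# constructor argument of L.Attention exists in the TF 2.x versions this snippet targets.
from tensorflow.keras import layers as L, backend as K

enc_in = L.Input(shape=(12, 8))
dec_in = L.Input(shape=(6, 8))
encoder = L.LSTM(32, return_sequences=True)(enc_in)
decoder = L.LSTM(32, return_sequences=True, return_state=True)(dec_in)

params = {"type_attention": "Luong_keras",
          "use_scale": True,
          "use_self_attention": False,
          "dropout": 0.1}
combined = Build_Attention_layer(params, encoder, decoder)   # (batch, 6, 64)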
def __init__(self, *args, **kwargs):
    self.model = Sequential([
        Dense(10, activation="relu", input_shape=(2,)),
        Dense(10, activation="relu"),
        Dense(2)
    ])
    self.model.compile("sgd", "mse", metrics=["mae"])

    x1 = Input(shape=(10,))
    x2 = Input(shape=(10,))
    y = dot([Dense(10)(x1), Dense(10)(x2)], axes=1)
    self.model2 = Model(inputs=[x1, x2], outputs=y)
    self.model2.compile(loss="mse", optimizer="adam")

    super(TestTraining, self).__init__(*args, **kwargs)