def decomposable_attention(pretrained_embedding=config.word_embed_weights,
                           projection_dim=300, projection_hidden=0, projection_dropout=0.2,
                           compare_dim=500, compare_dropout=0.2,
                           dense_dim=300, dense_dropout=0.2,
                           lr=1e-3, activation='elu', maxlen=MAX_LEN):
    # Based on: https://arxiv.org/abs/1606.01933
    magic_input = Input(shape=(len(config.feats),))
    magic_dense = BatchNormalization()(magic_input)
    # Note: magic_dense is built but not used in the classifier head below.
    magic_dense = Dense(64, activation='relu')(magic_dense)

    q1 = Input(name='q1', shape=(maxlen,))
    q2 = Input(name='q2', shape=(maxlen,))

    # Embedding
    embedding = create_pretrained_embedding(pretrained_embedding,
                                            mask_zero=False)
    q1_embed = embedding(q1)
    q2_embed = embedding(q2)

    # Projection
    projection_layers = []
    if projection_hidden > 0:
        projection_layers.extend([
            Dense(projection_hidden, activation=activation),
            Dropout(rate=projection_dropout),
        ])
    projection_layers.extend([
        Dense(projection_dim, activation=None),
        Dropout(rate=projection_dropout),
    ])
    q1_encoded = time_distributed(q1_embed, projection_layers)
    q2_encoded = time_distributed(q2_embed, projection_layers)

    # Attention
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compare
    q1_combined = Concatenate()(
        [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
    q2_combined = Concatenate()(
        [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])
    compare_layers = [
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
        Dense(compare_dim, activation=activation),
        Dropout(compare_dropout),
    ]
    q1_compare = time_distributed(q1_combined, compare_layers)
    q2_compare = time_distributed(q2_combined, compare_layers)

    # Aggregate
    # q1_rep = apply_multiple(q1_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    # q2_rep = apply_multiple(q2_compare, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    q1_rep_max = MyMaxPool(axis=1)(q1_compare)
    q2_rep_max = MyMaxPool(axis=1)(q2_compare)
    cro_max = cross(q1_rep_max, q2_rep_max, compare_dim)
    dist = distence(q1_rep_max, q2_rep_max)

    dense = Concatenate()([q1_rep_max, q2_rep_max, cro_max, dist])
    # merged = Concatenate()([q1_rep, q2_rep, magic_dense])

    # Classifier
    dense = BatchNormalization()(dense)
    dense = Dense(dense_dim, activation=activation)(dense)
    dense = Dropout(dense_dropout)(dense)
    dense = BatchNormalization()(dense)
    dense = Dense(dense_dim, activation=activation)(dense)
    dense = Dropout(dense_dropout)(dense)
    out_ = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[q1, q2, magic_input], outputs=out_)
    model.compile(optimizer=Adam(lr=lr), loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model
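
# A hypothetical usage sketch (not part of the original project): shows how the
# decomposable-attention model above might be fit on dummy data. The sample
# count, vocabulary range and batch size are illustrative assumptions; real
# token ids must come from the vocabulary behind `config.word_embed_weights`.
def _demo_decomposable_attention():
    import numpy as np
    n = 32
    q1_ids = np.random.randint(1, 100, size=(n, MAX_LEN))
    q2_ids = np.random.randint(1, 100, size=(n, MAX_LEN))
    feats = np.random.rand(n, len(config.feats)).astype('float32')
    labels = np.random.randint(0, 2, size=(n, 1))
    model = decomposable_attention()
    # The magic-features array must still be fed, even though this head
    # currently ignores magic_dense.
    model.fit([q1_ids, q2_ids, feats], labels, batch_size=8, epochs=1)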
def bma_gru():
    # The embedding layers containing the pretrained char / word vectors
    emb_layer = create_pretrained_embedding(
        config.char_embed_weights, mask_zero=True)
    emb_layer_word = create_pretrained_embedding(
        config.word_embed_weights, mask_zero=True)

    # Model variables
    n_hidden = 128

    # Define the shared char-level encoder
    x = Sequential()
    x.add(emb_layer)
    # LSTM
    x.add(Bidirectional(LSTM(n_hidden, return_sequences=True)))
    x.add(Bidirectional(LSTM(n_hidden, return_sequences=True)))
    x.add(BatchNormalization())
    x.add(MyMaxPool(axis=1))
    shared_model = x

    # Define the shared word-level encoder
    x2 = Sequential()
    x2.add(emb_layer_word)
    # LSTM
    x2.add(Bidirectional(LSTM(10, return_sequences=True)))
    # x2.add(Bidirectional(LSTM(n_hidden, return_sequences=True)))
    x2.add(BatchNormalization())
    x2.add(MyMaxPool(axis=1))
    shared_model2 = x2

    # The visible layer
    magic_input = Input(shape=(len(config.feats), ))
    magic_dense = BatchNormalization()(magic_input)
    magic_dense = Dense(64, activation='relu')(magic_dense)

    left_input = Input(shape=(config.word_maxlen, ), dtype='int32')
    right_input = Input(shape=(config.word_maxlen, ), dtype='int32')
    w1 = Input(shape=(config.word_maxlen, ), dtype='int32')
    w2 = Input(shape=(config.word_maxlen, ), dtype='int32')

    left = shared_model(left_input)
    right = shared_model(right_input)
    left_w = shared_model2(w1)
    right_w = shared_model2(w2)

    # Pack it all up into a Manhattan-distance similarity: exp(-L1 distance)
    malstm_distance = Lambda(
        lambda x: K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)),
        output_shape=(1, ))([left, right])
    malstm_distance2 = Lambda(
        lambda x: K.exp(-K.sum(K.abs(x[0] - x[1]), axis=1, keepdims=True)),
        output_shape=(1, ))([left_w, right_w])

    cro = cross(left, right, n_hidden * 2)
    # Note: cro2 and malstm_distance are computed but not fed into the merge below.
    cro2 = cross(left_w, right_w, n_hidden * 2)

    # if config.nofeats:
    merge = concatenate([left, right, cro, malstm_distance2, magic_dense])
    # , magic_dense, malstm_distance])
    # else:
    #     merge = concatenate([cro, cro2])

    # The MLP that determines the outcome
    x = Dropout(0.2)(merge)
    x = BatchNormalization()(x)
    x = Dense(300, activation='relu')(x)
    x = Dropout(0.2)(x)
    x = BatchNormalization()(x)
    pred = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[left_input, right_input, w1, w2, magic_input],
                  outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer="adam",
                  metrics=[Precision, Recall, F1])
    model.summary()
    shared_model.summary()
    return model
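
# Illustration only: the exponential Manhattan (MaLSTM-style) similarity that the
# two Lambda layers in `bma_gru` compute, restated in numpy. The function name and
# shapes are assumptions made for this sketch, not part of the original code.
def _manhattan_similarity(left_vec, right_vec):
    import numpy as np
    # left_vec, right_vec: (batch, dim) sentence vectors.
    # exp(-L1 distance): 1.0 for identical vectors, tending to 0 as they diverge.
    return np.exp(-np.sum(np.abs(left_vec - right_vec), axis=1, keepdims=True))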
def esim(pretrained_embedding=config.word_embed_weights,
         maxlen=MAX_LEN,
         lstm_dim=300,
         dense_dim=300,
         dense_dropout=0.2):
    # Based on: https://arxiv.org/abs/1609.06038
    magic_input = Input(shape=(len(config.feats),))
    magic_dense = BatchNormalization()(magic_input)
    magic_dense = Dense(64, activation='elu')(magic_dense)

    q1 = Input(name='q1', shape=(maxlen,))
    q2 = Input(name='q2', shape=(maxlen,))
    q1_w = Input(name='q1_w', shape=(maxlen,))
    q2_w = Input(name='q2_w', shape=(maxlen,))

    # Embedding
    emb_layer = create_pretrained_embedding(
        config.char_embed_weights, mask_zero=True)
    emb_layer_word = create_pretrained_embedding(
        config.word_embed_weights, mask_zero=True)

    # Encode
    encode = Sequential()
    encode.add(emb_layer)
    encode.add(BatchNormalization(axis=2))
    encode.add(Bidirectional(LSTM(lstm_dim, return_sequences=True)))

    encode2 = Sequential()
    encode2.add(emb_layer_word)
    encode2.add(BatchNormalization(axis=2))
    encode2.add(Bidirectional(LSTM(lstm_dim, return_sequences=True)))

    q1_encoded = encode(q1)
    q2_encoded = encode(q2)
    q1_w_encoded = encode2(q1_w)
    q2_w_encoded = encode2(q2_w)

    # Attention
    q1_aligned, q2_aligned = soft_attention_alignment(q1_encoded, q2_encoded)

    # Compose
    q1_combined = Concatenate()(
        [q1_encoded, q2_aligned, submult(q1_encoded, q2_aligned)])
    q2_combined = Concatenate()(
        [q2_encoded, q1_aligned, submult(q2_encoded, q1_aligned)])

    compose = Bidirectional(LSTM(lstm_dim, return_sequences=True))
    q1_compare = compose(q1_combined)
    q2_compare = compose(q2_combined)

    # Aggregate
    # q1_rep = apply_multiple(q1_compare, [MyMaxPool(axis=1), MyMeanPool(axis=1)])
    # q2_rep = apply_multiple(q2_compare, [MyMaxPool(axis=1), MyMeanPool(axis=1)])
    q1_rep = MyMaxPool(axis=1)(q1_compare)
    q2_rep = MyMaxPool(axis=1)(q2_compare)
    q1_w_rep = MyMaxPool(axis=1)(q1_w_encoded)
    q2_w_rep = MyMaxPool(axis=1)(q2_w_encoded)

    # Classifier
    cro = cross(q1_rep, q2_rep, lstm_dim * 2)
    dist = distence(q1_rep, q2_rep)
    dist2 = distence(q1_w_rep, q2_w_rep)

    dense = Concatenate()([q1_rep, q2_rep, cro, dist, dist2, magic_dense])
    dense = Dropout(dense_dropout)(dense)
    dense = Dense(dense_dim, activation='relu')(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(dense_dropout)(dense)
    out_ = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[q1, q2, q1_w, q2_w, magic_input], outputs=out_)
    model.compile(loss='binary_crossentropy', optimizer="adam",
                  metrics=[Precision, Recall, F1])
    model.summary()
    return model
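
# A minimal numpy sketch of the ESIM-style soft alignment (arXiv:1609.06038) that
# the repo's `soft_attention_alignment` helper is expected to perform; this is an
# assumption for illustration and may differ from the actual helper (e.g. masking,
# batching). Names and shapes here are hypothetical.
def _soft_align_sketch(q1_enc, q2_enc):
    import numpy as np
    # q1_enc: (len1, dim), q2_enc: (len2, dim) encoded token sequences.
    energy = q1_enc @ q2_enc.T                             # (len1, len2) attention energies
    w12 = np.exp(energy - energy.max(axis=1, keepdims=True))
    w12 /= w12.sum(axis=1, keepdims=True)                  # softmax over q2 positions
    w21 = np.exp(energy.T - energy.T.max(axis=1, keepdims=True))
    w21 /= w21.sum(axis=1, keepdims=True)                  # softmax over q1 positions
    q2_aligned = w12 @ q2_enc                              # q2 content aligned to each q1 position
    q1_aligned = w21 @ q1_enc                              # q1 content aligned to each q2 position
    return q1_aligned, q2_aligned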
def BMA_GRU(pretrained_embedding=config.word_embed_weights,
            maxlen=MAX_LEN,
            lstm_dim=300,
            dense_dim=300,
            dense_dropout=0.2,
            pool="max",
            mode='char+word'):
    # Based on: https://arxiv.org/abs/1609.06038
    magic_input = Input(shape=(len(config.feats), ))
    magic_dense = BatchNormalization()(magic_input)
    magic_dense = Dense(64, activation='elu')(magic_dense)

    q1 = Input(name='q1', shape=(maxlen, ))
    q2 = Input(name='q2', shape=(maxlen, ))
    q1_w = Input(name='q1_w', shape=(maxlen, ))
    q2_w = Input(name='q2_w', shape=(maxlen, ))

    # Embedding
    emb_layer = create_pretrained_embedding(
        config.char_embed_weights, mask_zero=False)
    emb_layer_word = create_pretrained_embedding(
        config.word_embed_weights, mask_zero=False)

    # Encode
    encode = Sequential()
    encode.add(emb_layer)
    encode.add(BatchNormalization(axis=2))
    encode.add(Bidirectional(CuDNNGRU(lstm_dim, return_sequences=True)))

    encode2 = Sequential()
    encode2.add(emb_layer_word)
    encode2.add(BatchNormalization(axis=2))
    encode2.add(Bidirectional(CuDNNGRU(lstm_dim, return_sequences=True)))

    q1_encoded = encode(q1)
    q2_encoded = encode(q2)
    q1_w_encoded = encode2(q1_w)
    q2_w_encoded = encode2(q2_w)

    # Attention + composition (ESIM block) on both the char and word branches
    att_flag = True
    q1_compare, q2_compare = esim_blok(q1_encoded, q2_encoded, att_flag)
    q1_compare_w, q2_compare_w = esim_blok(q1_w_encoded, q2_w_encoded, att_flag)

    # q1_rep, q2_rep = q1_encoded, q2_encoded
    # q1_w_rep, q2_w_rep = q1_w_encoded, q2_w_encoded
    # q1_rep, q2_rep = q1_compare, q2_compare
    # q1_w_rep, q2_w_rep = q1_compare_w, q2_compare_w

    # Aggregate
    if pool == 'max':
        q1_rep = MyMaxPool(axis=1)(q1_compare)
        q2_rep = MyMaxPool(axis=1)(q2_compare)
        q1_w_rep = MyMaxPool(axis=1)(q1_compare_w)
        q2_w_rep = MyMaxPool(axis=1)(q2_compare_w)
    elif pool == 'mean':
        q1_rep = MyMeanPool(axis=1)(q1_compare)
        q2_rep = MyMeanPool(axis=1)(q2_compare)
        q1_w_rep = MyMeanPool(axis=1)(q1_compare_w)
        q2_w_rep = MyMeanPool(axis=1)(q2_compare_w)
    else:
        q1_rep = Attention(maxlen)(q1_compare)
        q2_rep = Attention(maxlen)(q2_compare)
        q1_w_rep = Attention(maxlen)(q1_compare_w)
        q2_w_rep = Attention(maxlen)(q2_compare_w)

    # q1_rep = apply_multiple(q1_compare, [MyMaxPool(axis=1), MyMeanPool(axis=1)])
    # q2_rep = apply_multiple(q2_compare, [MyMaxPool(axis=1), MyMeanPool(axis=1)])

    # Classifier
    # Note: cro, dist and dist2 are computed but not used in the head below.
    cro = cross(q1_rep, q2_rep, lstm_dim * 2)
    dist = distence(q1_rep, q2_rep)
    dist2 = distence(q1_w_rep, q2_w_rep)

    if mode == "char":
        dense = Concatenate()([q1_rep, q2_rep])
    elif mode == "word":
        dense = Concatenate()([q1_w_rep, q2_w_rep])
    else:
        dense = Concatenate()([q1_rep, q2_rep, q1_w_rep, q2_w_rep])

    dense = Dropout(dense_dropout)(dense)
    dense = Dense(dense_dim, activation='relu')(dense)
    dense = BatchNormalization()(dense)
    dense = Dropout(dense_dropout)(dense)
    out_ = Dense(1, activation='sigmoid')(dense)

    model = Model(inputs=[q1, q2, q1_w, q2_w, magic_input], outputs=out_)
    model.compile(loss='binary_crossentropy', optimizer="adam",
                  metrics=[Precision, Recall, F1])
    model.summary()
    return model
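
# A hypothetical usage sketch for `BMA_GRU` (not from the original project): dummy
# char- and word-level id sequences plus the handcrafted features, with mean pooling
# and both branches enabled. Shapes and the vocabulary range are illustrative
# assumptions; the CuDNNGRU layers require a GPU build of TensorFlow/Keras to run.
def _demo_bma_gru():
    import numpy as np
    n = 16
    q1_char = np.random.randint(1, 100, size=(n, MAX_LEN))
    q2_char = np.random.randint(1, 100, size=(n, MAX_LEN))
    q1_word = np.random.randint(1, 100, size=(n, MAX_LEN))
    q2_word = np.random.randint(1, 100, size=(n, MAX_LEN))
    feats = np.random.rand(n, len(config.feats)).astype('float32')
    y = np.random.randint(0, 2, size=(n, 1))
    model = BMA_GRU(pool='mean', mode='char+word')
    model.fit([q1_char, q2_char, q1_word, q2_word, feats], y,
              batch_size=8, epochs=1)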