def elsa_doc_model(hidden_dim=64, dropout=0.5, mode='train'):
    # Two attention-pooled inputs: the English sentence encodings and the
    # other-language (e.g. Japanese) sentence encodings.
    I_en = Input(shape=(nb_maxlen[0], nb_feature[1]), dtype='float32')
    en_out = AttentionWeightedAverage()(I_en)
    I_ot = Input(shape=(nb_maxlen[1], nb_feature[0]), dtype='float32')
    jp_out = AttentionWeightedAverage()(I_ot)
    O_to = concatenate([jp_out, en_out])
    O_to = Dense(hidden_dim, activation='selu')(O_to)
    if mode == 'train':
        O_to = Dropout(dropout)(O_to)
    # Single sigmoid unit; the layer keeps the historical name 'softmax'.
    O_out = Dense(1, activation='sigmoid', name='softmax')(O_to)
    model = Model(inputs=[I_ot, I_en], outputs=O_out)
    return model
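# Hypothetical usage sketch for elsa_doc_model. It assumes the module-level
# globals nb_maxlen and nb_feature (sequence lengths and per-language feature
# dimensions) are already defined, and uses random arrays in place of real
# attention-ready sentence encodings.
def _demo_elsa_doc_model():
    import numpy as np
    model = elsa_doc_model(hidden_dim=64, dropout=0.5, mode='train')
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # One small batch of fake documents: [other-language input, English input], binary labels.
    X_ot = np.random.rand(8, nb_maxlen[1], nb_feature[0]).astype('float32')
    X_en = np.random.rand(8, nb_maxlen[0], nb_feature[1]).astype('float32')
    y = np.random.randint(0, 2, size=(8, 1))
    model.fit([X_ot, X_en], y, batch_size=4, epochs=1)
    return model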
def __init__(self, embed_size, max_features, maxlen, embedding_matrix, num_features):
    # Text branch: frozen pretrained embeddings -> BiLSTM -> attention pooling.
    input1 = Input(shape=(maxlen,))
    model1 = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(input1)
    model1 = Bidirectional(
        LSTM(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(model1)
    # model1 = GlobalMaxPool1D()(model1)
    model1 = AttentionWeightedAverage()(model1)
    model1 = Dense(300, activation="relu")(model1)
    model1 = Dropout(0.1)(model1)

    # Dense branch for hand-crafted numeric features.
    input2 = Input(shape=(num_features,))
    model2 = Dense(300, activation="relu")(input2)
    model2 = Dropout(0.1)(model2)

    # Merge the two branches and classify into six sigmoid outputs.
    merged = Add()([model1, model2])
    merged = BatchNormalization()(merged)
    merged = Dense(300)(merged)
    merged = PReLU()(merged)
    merged = Dropout(0.1)(merged)
    out = Dense(6, activation="sigmoid")(merged)

    self.model = Model(inputs=[input1, input2], outputs=out)
    self.model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
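# Hypothetical usage of the dual-input classifier above. The enclosing class is
# not shown in this snippet, so the name ToxicLstmWithFeatures is assumed here;
# padded token sequences and hand-crafted numeric features are fed as two inputs.
def _demo_dual_input_classifier(X_text, X_num, y, embedding_matrix):
    # X_text: (n, maxlen) int token ids, X_num: (n, num_features) floats,
    # y: (n, 6) multi-label targets for the six classes.
    clf = ToxicLstmWithFeatures(embed_size=embedding_matrix.shape[1],   # class name is an assumption
                                max_features=embedding_matrix.shape[0],
                                maxlen=X_text.shape[1],
                                embedding_matrix=embedding_matrix,
                                num_features=X_num.shape[1])
    clf.model.fit([X_text, X_num], y, batch_size=128, epochs=2, validation_split=0.1)
    return clf.model.predict([X_text, X_num])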
def __init__(self, embed_size, max_features, maxlen, embedding_matrix):
    inp = Input(shape=(maxlen,))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
    x = Bidirectional(GRU(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
    # x = GlobalMaxPool1D()(x)
    x = AttentionWeightedAverage()(x)
    x = Dense(300, activation="relu")(x)
    x = Dropout(0.1)(x)
    x = Dense(6, activation="sigmoid")(x)
    self.model = Model(inputs=inp, outputs=x)
    # optimizer = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0, amsgrad=False)
    self.model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
def __init__(self, embed_size, max_features, maxlen, embedding_matrix, num_features):
    # Frozen pretrained embeddings -> BiLSTM -> attention pooling -> 6 sigmoid outputs.
    # Note: num_features is accepted but unused in this variant.
    input1 = Input(shape=(maxlen,))
    model1 = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(input1)
    model1 = Bidirectional(LSTM(300, return_sequences=True))(model1)
    model1 = AttentionWeightedAverage()(model1)
    # model1 = GlobalMaxPool1D()(model1)
    model1 = Dense(300, activation="relu")(model1)
    model1 = Dropout(0.5)(model1)
    out = Dense(6, activation="sigmoid")(model1)
    self.model = Model(inputs=input1, outputs=out)
    self.model.compile(loss='binary_crossentropy', optimizer='Nadam', metrics=['accuracy'])
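# Hypothetical training sketch for the single-input variants above (the
# enclosing class name AttentionLstmClassifier is assumed; num_features is
# passed only to satisfy the constructor signature).
def _demo_attention_lstm(X_text, y, embedding_matrix):
    from keras.callbacks import EarlyStopping
    clf = AttentionLstmClassifier(embed_size=embedding_matrix.shape[1],   # class name is an assumption
                                  max_features=embedding_matrix.shape[0],
                                  maxlen=X_text.shape[1],
                                  embedding_matrix=embedding_matrix,
                                  num_features=0)
    clf.model.fit(X_text, y, batch_size=128, epochs=10, validation_split=0.1,
                  callbacks=[EarlyStopping(monitor='val_loss', patience=2)])
    return clf.model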
def build_word_model(DENSE_UNITS=DENSE_UNITS, LEARNING_RATE=LEARNING_RATE, ACTIVATION=DENSE_ACTIVATION,
                     VOCAB_SIZE=VOCAB_SIZE, EMBED_DIM=EMBED_DIM, OPTIMIZER=OPTIMIZER, MOMENTUM=MOMENTUM,
                     LEN_TWEET=LEN_TWEET, MAX_NUM_TWEETS=MAX_NUM_TWEETS, GRU_UNITS=GRU_UNITS,
                     L2_REG=L2_REG, NUM_LABELS=NUM_LABELS):
    """
    Build the word_model so that weights can be loaded into it afterwards.
    Allows for word-level attention visualization.
    """
    # Word layer
    word_input = Input(shape=(LEN_TWEET,), name="word_input", dtype="uint16")
    word_embedding = Embedding(input_dim=VOCAB_SIZE,
                               output_dim=EMBED_DIM,
                               input_length=LEN_TWEET)(word_input)
    word_encoding = Bidirectional(
        GRU(units=GRU_UNITS,
            input_shape=(MAX_NUM_TWEETS, EMBED_DIM),
            return_sequences=True,
            kernel_regularizer=L2_REG))(word_embedding)
    word_dense = TimeDistributed(
        Dense(DENSE_UNITS, activation=ACTIVATION), name='word_dense')(word_encoding)
    # Name the attention layer so it can be extracted for visualization
    word_att = AttentionWeightedAverage(name='word_att')(word_dense)
    word_model = Model(word_input, word_att)
    return word_model
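# Hedged sketch of how build_word_model can pick up trained weights from a full
# HAN model built by build_full_model (defined further down). The layer index
# below is an assumption about where TimeDistributed(word_model) sits in
# full_model.layers; adjust it to the actual model.
def _demo_extract_word_model(full_model):
    word_model = build_word_model()
    # The word model appears in the full HAN as the layer wrapped by TimeDistributed.
    trained_word_model = full_model.layers[1].layer   # index 1 is an assumption
    word_model.set_weights(trained_word_model.get_weights())
    return word_model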
def elsa_architecture(nb_classes, nb_tokens, maxlen, feature_output=False, embed_dropout_rate=0,
                      final_dropout_rate=0, embed_l2=1E-6, return_attention=False,
                      load_embedding=False, pre_embedding=None, high=False, test=False,
                      LSTM_drop=0.5, LSTM_hidden=512):
    """
    Returns the DeepMoji-style architecture used by ELSA, uninitialized and
    without using the pretrained model weights.

    # Arguments:
        nb_classes: Number of classes in the dataset.
        nb_tokens: Number of tokens in the dataset (i.e. vocabulary size).
        maxlen: Maximum length of a sequence (in tokens).
        feature_output: If True the model returns the penultimate
                        feature vector rather than Softmax probabilities
                        (defaults to False).
        embed_dropout_rate: Dropout rate for the embedding layer.
        final_dropout_rate: Dropout rate for the final Softmax layer.
        embed_l2: L2 regularization for the embedding layer.
        return_attention: If True, also return the per-timestep attention weights.
        load_embedding / pre_embedding: If a pretrained embedding matrix is
                        supplied, it initializes the (trainable) embedding layer.
        high: Whether to pass the concatenated features through a highway network.
        test: If True, all dropout is disabled.
        LSTM_drop: Dropout rate inside the two bidirectional LSTMs.
        LSTM_hidden: Hidden size of each LSTM direction.

    # Returns:
        Model with the given parameters.
    """
    class NonMasking(Layer):
        def __init__(self, **kwargs):
            self.supports_masking = True
            super(NonMasking, self).__init__(**kwargs)

        def build(self, input_shape):
            pass

        def compute_mask(self, input, input_mask=None):
            # do not pass the mask to the next layers
            return None

        def call(self, x, mask=None):
            return x

        def get_output_shape_for(self, input_shape):
            return input_shape

    # define embedding layer that turns word tokens into vectors
    # an activation function is used to bound the values of the embedding
    model_input = Input(shape=(maxlen,), dtype='int32')
    embed_reg = L1L2(l2=embed_l2) if embed_l2 != 0 else None
    if not load_embedding and pre_embedding is None:
        embed = Embedding(input_dim=nb_tokens, output_dim=300, mask_zero=True,
                          input_length=maxlen, embeddings_regularizer=embed_reg,
                          name='embedding')
    else:
        embed = Embedding(input_dim=nb_tokens, output_dim=300, mask_zero=True,
                          input_length=maxlen, weights=[pre_embedding],
                          embeddings_regularizer=embed_reg, trainable=True,
                          name='embedding')
    if high:
        x = NonMasking()(embed(model_input))
    else:
        x = embed(model_input)
    x = Activation('tanh')(x)

    # entire embedding channels are dropped out instead of the
    # normal Keras embedding dropout, which drops all channels for entire words
    # many of the datasets contain so few words that losing one or more words can alter the emotions completely
    if not test and embed_dropout_rate != 0:
        embed_drop = SpatialDropout1D(embed_dropout_rate, name='embed_drop')
        x = embed_drop(x)

    # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
    # ordering of the way the merge is done is important for consistency with the pretrained model
    lstm_0_output = Bidirectional(LSTM(LSTM_hidden, return_sequences=True,
                                       dropout=0.0 if test else LSTM_drop),
                                  name="bi_lstm_0")(x)
    lstm_1_output = Bidirectional(LSTM(LSTM_hidden, return_sequences=True,
                                       dropout=0.0 if test else LSTM_drop),
                                  name="bi_lstm_1")(lstm_0_output)
    x = concatenate([lstm_1_output, lstm_0_output, x])
    if high:
        x = TimeDistributed(Highway(activation='tanh', name="high"))(x)

    # if return_attention is True in AttentionWeightedAverage, an additional tensor
    # representing the weight at each timestep is returned
    weights = None
    x = AttentionWeightedAverage(name='attlayer', return_attention=return_attention)(x)
    # x = MaskAverage(name='attlayer', return_attention=return_attention)(x)
    if return_attention:
        x, weights = x

    if not feature_output:
        # output class probabilities
        if not test and final_dropout_rate != 0:
            x = Dropout(final_dropout_rate)(x)
        if nb_classes > 2:
            outputs = [Dense(nb_classes, activation='softmax', name='softmax')(x)]
        else:
            outputs = [Dense(1, activation='sigmoid', name='softmax')(x)]
    else:
        # output penultimate feature vector
        outputs = [x]

    if return_attention:
        # add the attention weights to the outputs if required
        outputs.append(weights)

    return Model(inputs=[model_input], outputs=outputs)
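# Hypothetical construction sketch for elsa_architecture: a binary classifier
# over a placeholder vocabulary and sequence length; pre_embedding, if given,
# is assumed to be an (nb_tokens, 300) matrix.
def _demo_elsa_architecture(pre_embedding=None):
    model = elsa_architecture(nb_classes=2, nb_tokens=50000, maxlen=100,
                              embed_dropout_rate=0.25, final_dropout_rate=0.5,
                              load_embedding=pre_embedding is not None,
                              pre_embedding=pre_embedding)
    # nb_classes=2 yields a single sigmoid output, so binary_crossentropy applies.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model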
def deepmoji_architecture(nb_classes, nb_tokens, maxlen, feature_output=False,
                          embed_dropout_rate=0, final_dropout_rate=0, embed_l2=1E-6,
                          return_attention=False):
    """
    Returns the DeepMoji architecture uninitialized and
    without using the pretrained model weights.

    # Arguments:
        nb_classes: Number of classes in the dataset.
        nb_tokens: Number of tokens in the dataset (i.e. vocabulary size).
        maxlen: Maximum length of a sequence (in tokens).
        feature_output: If True the model returns the penultimate
                        feature vector rather than Softmax probabilities
                        (defaults to False).
        embed_dropout_rate: Dropout rate for the embedding layer.
        final_dropout_rate: Dropout rate for the final Softmax layer.
        embed_l2: L2 regularization for the embedding layer.

    # Returns:
        Model with the given parameters.
    """
    # define embedding layer that turns word tokens into vectors
    # an activation function is used to bound the values of the embedding
    model_input = Input(shape=(maxlen,), dtype='int32')
    embed_reg = L1L2(l2=embed_l2) if embed_l2 != 0 else None
    embed = Embedding(input_dim=nb_tokens, output_dim=256, mask_zero=True,
                      input_length=maxlen, embeddings_regularizer=embed_reg,
                      name='embedding')
    x = embed(model_input)
    x = Activation('tanh')(x)

    # entire embedding channels are dropped out instead of the
    # normal Keras embedding dropout, which drops all channels for entire words
    # many of the datasets contain so few words that losing one or more words can alter the emotions completely
    if embed_dropout_rate != 0:
        embed_drop = SpatialDropout1D(embed_dropout_rate, name='embed_drop')
        x = embed_drop(x)

    # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
    # ordering of the way the merge is done is important for consistency with the pretrained model
    lstm_0_output = Bidirectional(LSTM(512, return_sequences=True), name="bi_lstm_0")(x)
    lstm_1_output = Bidirectional(LSTM(512, return_sequences=True), name="bi_lstm_1")(lstm_0_output)
    x = concatenate([lstm_1_output, lstm_0_output, x])

    # if return_attention is True in AttentionWeightedAverage, an additional tensor
    # representing the weight at each timestep is returned
    weights = None
    x = AttentionWeightedAverage(name='attlayer', return_attention=return_attention)(x)
    if return_attention:
        x, weights = x

    if not feature_output:
        # output class probabilities
        if final_dropout_rate != 0:
            x = Dropout(final_dropout_rate)(x)
        if nb_classes > 2:
            outputs = [Dense(nb_classes, activation='softmax', name='softmax')(x)]
        elif nb_classes == 2:
            outputs = [Dense(2, activation='softmax', name='softmax')(x)]
        else:
            # nb_classes == 0 is used to adjust the model to a regression task
            outputs = [Dense(1, activation='tanh', name='softmax')(x)]
    else:
        # output penultimate feature vector
        outputs = [x]

    if return_attention:
        # add the attention weights to the outputs if required
        outputs.append(weights)

    return Model(inputs=[model_input], outputs=outputs, name="DeepMoji")
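# Hypothetical sketch using deepmoji_architecture as a fixed feature extractor:
# with feature_output=True the model returns the attention-pooled 2304-d vector
# (two 1024-d BiLSTM outputs plus the 256-d embedding skip connection). The
# vocabulary size and maxlen below are placeholders.
def _demo_deepmoji_features(tokenized_batch):
    feat_model = deepmoji_architecture(nb_classes=2, nb_tokens=50000, maxlen=30,
                                       feature_output=True)
    # tokenized_batch: int array of shape (batch, 30) of word indices.
    return feat_model.predict(tokenized_batch)   # shape (batch, 2304)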
def build_full_model(DENSE_UNITS=DENSE_UNITS, LEARNING_RATE=LEARNING_RATE, ACTIVATION=DENSE_ACTIVATION,
                     VOCAB_SIZE=VOCAB_SIZE, EMBED_DIM=EMBED_DIM, OPTIMIZER=OPTIMIZER, MOMENTUM=MOMENTUM,
                     LEN_TWEET=LEN_TWEET, MAX_NUM_TWEETS=MAX_NUM_TWEETS, GRU_UNITS=GRU_UNITS,
                     L2_REG=L2_REG, NUM_LABELS=NUM_LABELS, save_word_model=False):
    """
    Model architecture for the Hierarchical Attention Network.

    Create a list as word_model_container = [0] before calling
    build_full_model(save_word_model=True) to extract the word_model
    preceding the tweet-level layers. (An ugly hack.)
    """
    # Word layer
    word_input = Input(shape=(LEN_TWEET,), name="word_input", dtype="uint16")
    word_embedding = Embedding(input_dim=VOCAB_SIZE,
                               output_dim=EMBED_DIM,
                               input_length=LEN_TWEET)(word_input)
    word_encoding = Bidirectional(
        GRU(units=GRU_UNITS,
            input_shape=(MAX_NUM_TWEETS, EMBED_DIM),
            return_sequences=True,
            kernel_regularizer=L2_REG))(word_embedding)
    word_dense = TimeDistributed(
        Dense(DENSE_UNITS, activation=ACTIVATION), name='word_dense')(word_encoding)
    # Name the attention layer so it can be extracted for visualization
    word_att = AttentionWeightedAverage(name='word_att')(word_dense)
    word_model = Model(word_input, word_att)

    if save_word_model:
        # hack for saving word_model: fill the pre-created container slot
        # instead of rebinding a local name, so the caller can retrieve it
        print('Saving Word Model')
        word_model_container[0] = word_model

    # Sentence (tweet) layer
    tweet_input = Input(shape=(MAX_NUM_TWEETS, LEN_TWEET), dtype="int32")
    tweet_encoding = TimeDistributed(word_model)(tweet_input)
    tweet_lstm = Bidirectional(
        GRU(units=GRU_UNITS,
            return_sequences=True,
            kernel_regularizer=L2_REG))(tweet_encoding)
    tweet_dense = TimeDistributed(Dense(DENSE_UNITS, activation=ACTIVATION),
                                  name='tweet_dense')(tweet_lstm)
    tweet_att = AttentionWeightedAverage(name='tweet_att')(tweet_dense)
    preds = Dense(NUM_LABELS, activation='softmax')(tweet_att)
    model = Model(tweet_input, preds)

    # Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=OPTIMIZER(momentum=MOMENTUM, lr=LEARNING_RATE),
                  metrics=['acc'])
    return model
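# Hypothetical end-to-end sketch for the HAN: documents are padded to
# (MAX_NUM_TWEETS, LEN_TWEET) token-id matrices, labels are one-hot with
# NUM_LABELS columns, and the word_model_container hack from the docstring
# is used to keep a handle on the word-level sub-model.
def _demo_han_training(X_docs, y_onehot):
    global word_model_container
    word_model_container = [0]
    model = build_full_model(save_word_model=True)
    model.fit(X_docs, y_onehot, batch_size=32, epochs=5, validation_split=0.1)
    word_model = word_model_container[0]   # word-level sub-model for attention visualization
    return model, word_model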