Example #1
def elsa_doc_model(hidden_dim=64, dropout=0.5, mode='train'):
    I_en = Input(shape=(nb_maxlen[0], nb_feature[1]), dtype='float32')
    en_out = AttentionWeightedAverage()(I_en)
    I_ot = Input(shape=(nb_maxlen[1], nb_feature[0]), dtype='float32')
    jp_out = AttentionWeightedAverage()(I_ot)
    O_to = concatenate([jp_out, en_out])
    O_to = Dense(hidden_dim, activation='selu')(O_to)
    if mode == 'train':
        O_to = Dropout(dropout)(O_to)
    O_out = Dense(1, activation='sigmoid', name='softmax')(O_to)
    model = Model(inputs=[I_ot, I_en], outputs=O_out)
    return model
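Every example in this section relies on an AttentionWeightedAverage layer that none of the snippets define. Below is a minimal sketch of a compatible layer in the DeepMoji style, written for standalone Keras 2: it learns a single projection vector, applies a mask-aware softmax over the resulting per-timestep logits, and returns the attention-weighted average over the time axis. The weight naming, initializer, and return_attention handling are assumptions for illustration; refer to the original DeepMoji/ELSA repositories for the reference implementation.

from keras import backend as K
from keras.layers import Layer


class AttentionWeightedAverage(Layer):
    """Sketch of a DeepMoji-style attention pooling layer (assumed implementation)."""

    def __init__(self, return_attention=False, **kwargs):
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(**kwargs)

    def build(self, input_shape):
        # one learnable projection vector over the feature axis: (features, 1)
        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[2], 1),
                                 initializer='uniform')
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        # per-timestep logits, then a masked softmax over the time axis
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))
        if mask is not None:
            ai = ai * K.cast(mask, K.floatx())
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        # weighted sum of the timesteps: (batch, time, feat) -> (batch, feat)
        result = K.sum(x * K.expand_dims(att_weights), axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def compute_output_shape(self, input_shape):
        output_shape = (input_shape[0], input_shape[2])
        if self.return_attention:
            return [output_shape, (input_shape[0], input_shape[1])]
        return output_shape

    def compute_mask(self, inputs, input_mask=None):
        # the time axis is pooled away, so no mask is propagated
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        return None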
Example #2
    def __init__(self, embed_size, max_features, maxlen, embedding_matrix,
                 num_features):
        input1 = Input(shape=(maxlen, ))
        model1 = Embedding(max_features,
                           embed_size,
                           weights=[embedding_matrix],
                           trainable=False)(input1)
        model1 = Bidirectional(
            LSTM(300,
                 return_sequences=True,
                 dropout=0.1,
                 recurrent_dropout=0.1))(model1)
        # model1 = GlobalMaxPool1D()(model1)
        model1 = AttentionWeightedAverage()(model1)
        model1 = Dense(300, activation="relu")(model1)
        model1 = Dropout(0.1)(model1)

        input2 = Input(shape=(num_features, ))
        model2 = Dense(300, activation="relu")(input2)
        model2 = Dropout(0.1)(model2)

        merged = Add()([model1, model2])
        merged = BatchNormalization()(merged)
        merged = Dense(300)(merged)
        merged = PReLU()(merged)
        merged = Dropout(0.1)(merged)
        out = Dense(6, activation="sigmoid")(merged)
        self.model = Model(inputs=[input1, input2], outputs=out)
        self.model.compile(loss='binary_crossentropy',
                           optimizer='Adam',
                           metrics=['accuracy'])
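A hedged usage sketch for the two-input model above. The class name BiLstmAttentionClassifier is hypothetical (the snippet only shows its __init__), and the arrays are random placeholders that only illustrate the expected shapes: padded word indices of length maxlen for the first input, num_features hand-crafted numeric features for the second, and six binary labels.

import numpy as np

embed_size, max_features, maxlen, num_features = 300, 20000, 150, 10
embedding_matrix = np.random.normal(size=(max_features, embed_size))  # stand-in for real word vectors

clf = BiLstmAttentionClassifier(embed_size, max_features, maxlen,
                                embedding_matrix, num_features)  # hypothetical class name

X_tokens = np.random.randint(1, max_features, size=(32, maxlen))  # padded word-index sequences
X_extra = np.random.random(size=(32, num_features))               # numeric side features
y = np.random.randint(0, 2, size=(32, 6))                         # six binary labels
clf.model.fit([X_tokens, X_extra], y, batch_size=8, epochs=1)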
Example #3
    def __init__(self, embed_size, max_features, maxlen, embedding_matrix):
        inp = Input(shape=(maxlen,))
        x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
        x = Bidirectional(GRU(300, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
        # x = GlobalMaxPool1D()(x)
        x = AttentionWeightedAverage()(x)
        x = Dense(300, activation="relu")(x)
        x = Dropout(0.1)(x)
        x = Dense(6, activation="sigmoid")(x)
        self.model = Model(inputs=inp, outputs=x)
        # optimizer = optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0, amsgrad=False)
        self.model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
Example #4
    def __init__(self, embed_size, max_features, maxlen, embedding_matrix,
                 num_features):
        input1 = Input(shape=(maxlen, ))
        model1 = Embedding(max_features,
                           embed_size,
                           weights=[embedding_matrix],
                           trainable=False)(input1)
        model1 = Bidirectional(LSTM(300, return_sequences=True))(model1)
        model1 = AttentionWeightedAverage()(model1)
        # model1 = GlobalMaxPool1D()(model1)
        model1 = Dense(300, activation="relu")(model1)
        model1 = Dropout(0.5)(model1)
        out = Dense(6, activation="sigmoid")(model1)
        self.model = Model(inputs=input1, outputs=out)
        self.model.compile(loss='binary_crossentropy',
                           optimizer='Nadam',
                           metrics=['accuracy'])
Example #5
def build_word_model(DENSE_UNITS=DENSE_UNITS,
                     LEARNING_RATE=LEARNING_RATE,
                     ACTIVATION=DENSE_ACTIVATION,
                     VOCAB_SIZE=VOCAB_SIZE,
                     EMBED_DIM=EMBED_DIM,
                     OPTIMIZER=OPTIMIZER,
                     MOMENTUM=MOMENTUM,
                     LEN_TWEET=LEN_TWEET,
                     MAX_NUM_TWEETS=MAX_NUM_TWEETS,
                     GRU_UNITS=GRU_UNITS,
                     L2_REG=L2_REG,
                     NUM_LABELS=NUM_LABELS):
    """
    Build the word_model where weights can be loaded afterwards.
    Allows for word level attention visualization.
    """

    #Word layer
    word_input = Input(shape=(LEN_TWEET, ), name="word_input", dtype="uint16")

    word_embedding = Embedding(input_dim=VOCAB_SIZE,
                               output_dim=EMBED_DIM,
                               input_length=LEN_TWEET)(word_input)

    word_encoding = Bidirectional(
        GRU(units=GRU_UNITS,
            input_shape=(MAX_NUM_TWEETS, EMBED_DIM),
            return_sequences=True,
            kernel_regularizer=L2_REG))(word_embedding)

    word_dense = TimeDistributed(
        Dense(DENSE_UNITS, activation=ACTIVATION),
        name='word_dense')(word_encoding)  #Name layer to extract for viz

    word_att = AttentionWeightedAverage(name='word_att')(word_dense)
    word_model = Model(word_input, word_att)

    # The word model is returned uncompiled; the optimizer-related arguments in
    # the signature are only used when compiling the full model in build_full_model.
    return word_model
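Per the docstring, the returned word_model is meant to receive weights trained elsewhere and to expose the named layers for visualization. A short sketch of that workflow; the weights file name is a hypothetical placeholder:

word_model = build_word_model()
# word_model.load_weights('han_weights.h5', by_name=True)  # hypothetical trained-weights file
word_att_layer = word_model.get_layer('word_att')      # named layers are easy to look up
word_dense_layer = word_model.get_layer('word_dense')  # e.g. for attention visualization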
Example #6
def elsa_architecture(nb_classes,
                      nb_tokens,
                      maxlen,
                      feature_output=False,
                      embed_dropout_rate=0,
                      final_dropout_rate=0,
                      embed_l2=1E-6,
                      return_attention=False,
                      load_embedding=False,
                      pre_embedding=None,
                      high=False,
                      test=False,
                      LSTM_drop=0.5,
                      LSTM_hidden=512):
    """
    Returns the DeepMoji architecture uninitialized and
    without using the pretrained model weights.
    # Arguments:
        nb_classes: Number of classes in the dataset.
        nb_tokens: Number of tokens in the dataset (i.e. vocabulary size).
        maxlen: Maximum sequence length (number of tokens per input).
        feature_output: If True the model returns the penultimate
                        feature vector rather than Softmax probabilities
                        (defaults to False).
        embed_dropout_rate: Dropout rate for the embedding layer.
        final_dropout_rate: Dropout rate for the final Softmax layer.
        embed_l2: L2 regularization for the embedding layer.
        return_attention: If True, also return the per-timestep attention
                          weights from the attention layer.
        load_embedding / pre_embedding: If set, initialize the embedding layer
                                        from the given pre-trained matrix.
        high: Whether to apply a highway layer to the concatenated LSTM
              outputs (masking is disabled in that case).
        test: If True, disable all dropout (inference mode).
        LSTM_drop: Dropout rate inside the two bidirectional LSTM layers.
        LSTM_hidden: Number of hidden units per LSTM direction.
    # Returns:
        Model with the given parameters.
    """
    class NonMasking(Layer):
        def __init__(self, **kwargs):
            self.supports_masking = True
            super(NonMasking, self).__init__(**kwargs)

        def build(self, input_shape):
            input_shape = input_shape

        def compute_mask(self, input, input_mask=None):
            # do not pass the mask to the next layers
            return None

        def call(self, x, mask=None):
            return x

        def get_output_shape_for(self, input_shape):
            return input_shape

    # define embedding layer that turns word tokens into vectors
    # an activation function is used to bound the values of the embedding
    model_input = Input(shape=(maxlen, ), dtype='int32')
    embed_reg = L1L2(l2=embed_l2) if embed_l2 != 0 else None
    if not load_embedding and pre_embedding is None:
        embed = Embedding(input_dim=nb_tokens,
                          output_dim=300,
                          mask_zero=True,
                          input_length=maxlen,
                          embeddings_regularizer=embed_reg,
                          name='embedding')
    else:
        embed = Embedding(input_dim=nb_tokens,
                          output_dim=300,
                          mask_zero=True,
                          input_length=maxlen,
                          weights=[pre_embedding],
                          embeddings_regularizer=embed_reg,
                          trainable=True,
                          name='embedding')
    if high:
        x = NonMasking()(embed(model_input))
    else:
        x = embed(model_input)
    x = Activation('tanh')(x)

    # entire embedding channels are dropped out instead of the
    # normal Keras embedding dropout, which drops all channels for entire words
    # many of the datasets contain so few words that losing one or more words can alter the emotions completely
    if not test and embed_dropout_rate != 0:
        embed_drop = SpatialDropout1D(embed_dropout_rate, name='embed_drop')
        x = embed_drop(x)

    # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
    # ordering of the way the merge is done is important for consistency with the pretrained model
    lstm_0_output = Bidirectional(LSTM(LSTM_hidden,
                                       return_sequences=True,
                                       dropout=0.0 if test else LSTM_drop),
                                  name="bi_lstm_0")(x)
    lstm_1_output = Bidirectional(LSTM(LSTM_hidden,
                                       return_sequences=True,
                                       dropout=0.0 if test else LSTM_drop),
                                  name="bi_lstm_1")(lstm_0_output)
    x = concatenate([lstm_1_output, lstm_0_output, x])
    if high:
        x = TimeDistributed(Highway(activation='tanh', name="high"))(x)
    # if return_attention is True in AttentionWeightedAverage, an additional tensor
    # representing the weight at each timestep is returned
    weights = None
    x = AttentionWeightedAverage(name='attlayer',
                                 return_attention=return_attention)(x)
    #x = MaskAverage(name='attlayer', return_attention=return_attention)(x)
    if return_attention:
        x, weights = x

    if not feature_output:
        # output class probabilities
        if not test and final_dropout_rate != 0:
            x = Dropout(final_dropout_rate)(x)

        if nb_classes > 2:
            outputs = [
                Dense(nb_classes, activation='softmax', name='softmax')(x)
            ]
        else:
            outputs = [Dense(1, activation='sigmoid', name='softmax')(x)]
    else:
        # output penultimate feature vector
        outputs = [x]

    if return_attention:
        # add the attention weights to the outputs if required
        outputs.append(weights)

    return Model(inputs=[model_input], outputs=outputs)
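A minimal sketch of instantiating elsa_architecture with a pre-trained embedding matrix. The vocabulary size, sequence length, and the all-zero matrix are placeholders, and the optimizer choice is illustrative rather than taken from the original training script:

import numpy as np

nb_tokens, maxlen = 20000, 20
pre_embedding = np.zeros((nb_tokens, 300))  # placeholder for real 300-d word vectors

model = elsa_architecture(nb_classes=2, nb_tokens=nb_tokens, maxlen=maxlen,
                          load_embedding=True, pre_embedding=pre_embedding,
                          embed_dropout_rate=0.25, final_dropout_rate=0.5)
# nb_classes == 2 gives a single sigmoid unit, so binary cross-entropy fits here.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()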
Example #7
def deepmoji_architecture(nb_classes,
                          nb_tokens,
                          maxlen,
                          feature_output=False,
                          embed_dropout_rate=0,
                          final_dropout_rate=0,
                          embed_l2=1E-6,
                          return_attention=False):
    """
    Returns the DeepMoji architecture uninitialized and
    without using the pretrained model weights.

    # Arguments:
        nb_classes: Number of classes in the dataset.
        nb_tokens: Number of tokens in the dataset (i.e. vocabulary size).
        maxlen: Maximum sequence length (number of tokens per input).
        feature_output: If True the model returns the penultimate
                        feature vector rather than Softmax probabilities
                        (defaults to False).
        embed_dropout_rate: Dropout rate for the embedding layer.
        final_dropout_rate: Dropout rate for the final Softmax layer.
        embed_l2: L2 regularization for the embedding layer.

    # Returns:
        Model with the given parameters.
    """
    # define embedding layer that turns word tokens into vectors
    # an activation function is used to bound the values of the embedding
    model_input = Input(shape=(maxlen, ), dtype='int32')
    embed_reg = L1L2(l2=embed_l2) if embed_l2 != 0 else None
    embed = Embedding(input_dim=nb_tokens,
                      output_dim=256,
                      mask_zero=True,
                      input_length=maxlen,
                      embeddings_regularizer=embed_reg,
                      name='embedding')
    x = embed(model_input)
    x = Activation('tanh')(x)

    # entire embedding channels are dropped out instead of the
    # normal Keras embedding dropout, which drops all channels for entire words
    # many of the datasets contain so few words that losing one or more words can alter the emotions completely
    if embed_dropout_rate != 0:
        embed_drop = SpatialDropout1D(embed_dropout_rate, name='embed_drop')
        x = embed_drop(x)

    # skip-connection from embedding to output eases gradient-flow and allows access to lower-level features
    # ordering of the way the merge is done is important for consistency with the pretrained model
    lstm_0_output = Bidirectional(LSTM(512, return_sequences=True),
                                  name="bi_lstm_0")(x)
    lstm_1_output = Bidirectional(LSTM(512, return_sequences=True),
                                  name="bi_lstm_1")(lstm_0_output)
    x = concatenate([lstm_1_output, lstm_0_output, x])

    # if return_attention is True in AttentionWeightedAverage, an additional tensor
    # representing the weight at each timestep is returned
    weights = None
    x = AttentionWeightedAverage(name='attlayer',
                                 return_attention=return_attention)(x)
    if return_attention:
        x, weights = x

    if not feature_output:
        # output class probabilities
        if final_dropout_rate != 0:
            x = Dropout(final_dropout_rate)(x)

        if nb_classes > 2:
            outputs = [
                Dense(nb_classes, activation='softmax', name='softmax')(x)
            ]
        elif nb_classes == 2:
            outputs = [Dense(2, activation='softmax', name='softmax')(x)]
        else:
            # nb_classes == 0 is used here to switch the model to a regression
            # task: a single tanh output unit instead of a softmax head.
            outputs = [Dense(1, activation='tanh', name='softmax')(x)]
    else:
        # output penultimate feature vector
        outputs = [x]

    if return_attention:
        # add the attention weights to the outputs if required
        outputs.append(weights)

    return Model(inputs=[model_input], outputs=outputs, name="DeepMoji")
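The same function can be used as a feature extractor. A hedged sketch with placeholder token ids, using feature_output and return_attention to obtain the penultimate vector and the per-token attention weights:

import numpy as np

nb_tokens, maxlen = 20000, 30
token_ids = np.random.randint(1, nb_tokens, size=(4, maxlen))  # placeholder padded token ids

encoder = deepmoji_architecture(nb_classes=2, nb_tokens=nb_tokens, maxlen=maxlen,
                                feature_output=True, return_attention=True)
features, att_weights = encoder.predict(token_ids)
# features: (4, 2304) -- both BiLSTM outputs (2 * 1024) concatenated with the 256-d embedding
# att_weights: (4, 30) -- one attention weight per token from the 'attlayer' layer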
Example #8
def build_full_model(DENSE_UNITS=DENSE_UNITS,
                     LEARNING_RATE=LEARNING_RATE,
                     ACTIVATION=DENSE_ACTIVATION,
                     VOCAB_SIZE=VOCAB_SIZE,
                     EMBED_DIM=EMBED_DIM,
                     OPTIMIZER=OPTIMIZER,
                     MOMENTUM=MOMENTUM,
                     LEN_TWEET=LEN_TWEET,
                     MAX_NUM_TWEETS=MAX_NUM_TWEETS,
                     GRU_UNITS=GRU_UNITS,
                     L2_REG=L2_REG,
                     NUM_LABELS=NUM_LABELS,
                     save_word_model=False):
    """
    Model architecture for the Hierarchical Attention Network.
    Create a list as
    word_model_containter = [0]
    before calling build_model(save_word_model = True)
    to extract the word_model preceding the tweet level layers.
    (An ugly hack)
    """

    #Word layer
    word_input = Input(shape=(LEN_TWEET, ), name="word_input", dtype="uint16")

    word_embedding = Embedding(input_dim=VOCAB_SIZE,
                               output_dim=EMBED_DIM,
                               input_length=LEN_TWEET)(word_input)

    word_encoding = Bidirectional(
        GRU(units=GRU_UNITS,
            input_shape=(MAX_NUM_TWEETS, EMBED_DIM),
            return_sequences=True,
            kernel_regularizer=L2_REG))(word_embedding)

    word_dense = TimeDistributed(
        Dense(DENSE_UNITS, activation=ACTIVATION),
        name='word_dense')(word_encoding)  #Name layer to extract for viz

    word_att = AttentionWeightedAverage(name='word_att')(word_dense)
    word_model = Model(word_input, word_att)

    if save_word_model:  # hack for exposing word_model (see docstring)
        print('Saving Word Model')
        word_model_container[0] = word_model  # mutate the pre-created module-level list

    #Sentence layer
    tweet_input = Input(shape=(MAX_NUM_TWEETS, LEN_TWEET), dtype="int32")

    tweet_encoding = TimeDistributed(word_model)(tweet_input)

    tweet_lstm = Bidirectional(
        GRU(units=GRU_UNITS, return_sequences=True,
            kernel_regularizer=L2_REG))(tweet_encoding)

    tweet_dense = TimeDistributed(Dense(DENSE_UNITS, activation=ACTIVATION),
                                  name='tweet_dense')(tweet_lstm)

    tweet_att = AttentionWeightedAverage(name='tweet_att')(tweet_dense)
    preds = Dense(NUM_LABELS, activation='softmax')(tweet_att)
    model = Model(tweet_input, preds)

    #Compile model
    model.compile(loss='categorical_crossentropy',
                  optimizer=OPTIMIZER(momentum=MOMENTUM, lr=LEARNING_RATE),
                  metrics=['acc'])

    return model
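A hedged usage sketch for the hierarchical model above. All uppercase names are the module-level hyperparameters assumed by the default arguments, the arrays are random placeholders showing the expected shapes (one MAX_NUM_TWEETS x LEN_TWEET index matrix per user, one-hot labels over NUM_LABELS), and the word_model_container hack follows the docstring:

import numpy as np

word_model_container = [0]                      # pre-create the container (see docstring)
model = build_full_model(save_word_model=True)
word_model = word_model_container[0]            # word-level sub-model for attention viz

X = np.random.randint(0, VOCAB_SIZE, size=(16, MAX_NUM_TWEETS, LEN_TWEET))
y = np.eye(NUM_LABELS)[np.random.randint(0, NUM_LABELS, size=16)]
model.fit(X, y, batch_size=4, epochs=1)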