Example #1
File: train.py  Project: jasonaidm/IEmocap
final_train_audio = np.array(final_train_audio)
print('train_audio shape:', final_train_audio.shape)
print('train_text shape:', final_train_text.shape)
print('test_audio shape:', test_audio_data.shape)
print('test_text shape:', test_text_data.shape)
print('train_label shape:', final_train_label.shape)
print('test_label shape:', test_label.shape)
"""

# Audio branch
audio_input = Input(shape=(2250, 64))
mask_audio_input = Masking(mask_value=0.)(audio_input)
audio_l1 = Bidirectional(
    LSTM(128, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_audio'))(mask_audio_input)
audio_att = AttentionLayer()(audio_l1)
dropout_audio = Dropout(0.5)(audio_att)

audio_prediction = Dense(5, activation='softmax')(dropout_audio)
audio_model = Model(inputs=audio_input, outputs=audio_prediction)
inter_audio_model = Model(inputs=audio_input, outputs=audio_att)

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
audio_model.compile(loss='categorical_crossentropy',
                    optimizer=adam,
                    metrics=['accuracy'])

# Text Branch
text_input = Input(shape=(50, ))
em_text = Embedding(len(dic) + 1, 200, weights=[embed_matrix],
                    trainable=True)(text_input)
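# Note: every example on this page relies on a custom AttentionLayer that is
# not part of the excerpts. The sketch below is an assumption, not the
# project's implementation: a soft-attention layer that scores each timestep of
# a (batch, time, features) sequence and returns softmax-normalised
# per-timestep weights, which is what the Lambda(weight_expand)/Lambda(weight_dot)
# pattern in the later examples expects. (Example #1 above appears to use a
# variant that also pools the sequence into a single vector.)
from keras import backend
from keras.layers import Layer


class AttentionLayer(Layer):

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        self.supports_masking = True  # inputs come from Masking layers

    def build(self, input_shape):
        # one scoring weight per feature: shape (features, 1)
        self.w = self.add_weight(name='att_w',
                                 shape=(input_shape[-1], 1),
                                 initializer='glorot_uniform',
                                 trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x, mask=None):
        # score each timestep, then normalise over time
        # (mask handling is omitted in this sketch)
        scores = backend.squeeze(backend.dot(x, self.w), axis=-1)  # (batch, time)
        return backend.softmax(scores)

    def compute_mask(self, inputs, mask=None):
        return None  # the weights are consumed by Lambda layers downstream

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[1])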
Example #2
# LSTM layer
audio = LSTM(512,
             return_sequences=True,
             recurrent_dropout=0.25,
             name='LSTM_audio_1')(audio)

audio = LSTM(256,
             return_sequences=True,
             recurrent_dropout=0.25,
             name='LSTM_audio_2')(audio)

#frame_l1 = BatchNormalization()(frame_l1)

# attention layer
audio_weight = AttentionLayer()(audio)
audio_weight = Lambda(weight_expand)(audio_weight)
audio_vector = Lambda(weight_dot)([audio, audio_weight])
audio_feature_vector = Lambda(lambda x: backend.sum(x, axis=1))(audio_vector)

# dropout layer
dropout_audio = Dropout(0.5)(audio_feature_vector)
dense_audio_1 = Dense(128, activation='relu')(dropout_audio)
dropout_audio = Dropout(0.5)(dense_audio_1)

# decision-making
audio_prediction = Dense(numclass, activation='softmax')(dropout_audio)
audio_model = Model(inputs=[left_input, right_input], outputs=audio_prediction)
inter_audio = Model(inputs=[left_input, right_input],
                    outputs=audio_feature_vector)
def weight_dot(inputs):
    # element-wise product of the LSTM outputs and the (expanded) attention weights
    x = inputs[0]
    y = inputs[1]
    return x * y
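# weight_expand is used throughout these examples but never shown. Assuming
# AttentionLayer returns weights of shape (batch, time), it most plausibly just
# adds a trailing axis so the weights broadcast against the (batch, time,
# features) LSTM output inside weight_dot:
def weight_expand(x):
    # (batch, time) -> (batch, time, 1)
    return backend.expand_dims(x, axis=-1)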


# Contextual branch
context_input = Input(shape=(537, 64))
mask_context_input = Masking(mask_value=0.)(context_input)
context_l1 = Bidirectional(
    LSTM(256,
         return_sequences=True,
         recurrent_dropout=0.25,
         name='contextual_LSTM'))(mask_context_input)
context_weight = AttentionLayer()(context_l1)
context_weight_exp = Lambda(weight_expand)(context_weight)
context_attention = Lambda(weight_dot)([context_l1, context_weight_exp])
context_att = Lambda(lambda x: backend.sum(x, axis=1))(context_attention)
dropout_context = Dropout(0.25)(context_att)

# Original Branch
ori_input = Input(shape=(537, 64))
mask_ori_input = Masking(mask_value=0.)(ori_input)
ori_l1 = Bidirectional(
    LSTM(256,
         return_sequences=True,
         recurrent_dropout=0.25,
         name='original_LSTM'))(mask_ori_input)
ori_weight = AttentionLayer()(ori_l1)
ori_weight_exp = Lambda(weight_expand)(ori_weight)
Example #4
def hierarchical_attention(
        max_seq,
        emb_weights=None,
        embedding_size=None,
        vocab_size=None,  # embedding
        recursive_class=GRU,
        word_rnnsize=100,  # rnn
        drop_wordemb=0.2,
        drop_wordrnnout=0.2):
    """
    Creates a model based on the Hierarchical Attention model according to : https://arxiv.org/abs/1606.02393
    inputs:
        maxSeq : max size for sentences
        embedding
            embWeights : numpy matrix with embedding values
            embeddingSize (if embWeights is None) : embedding size
            vocabSize (if embWeights is None) : vocabulary size
        Recursive Layers
            recursiveClass : class for recursive class. Default is GRU
            wordRnnSize : RNN size for word sequence
            sentenceRnnSize :  RNN size for sentence sequence
        Dense Layers
            wordDenseSize: dense layer at exit from RNN , on sentence at word level
            sentenceHiddenSize : dense layer at exit from RNN , on document at sentence level
        Dropout

    returns : Two models. They are the same, but the second contains multiple outputs that can be use to analyse attention.
    """

    # Sentence level logic

    # Input Layer
    words_inputs = Input(shape=(max_seq, ), dtype='int32', name='words_input')

    # Word embedding layer
    if emb_weights is None:
        emb = Embedding(vocab_size, embedding_size,
                        mask_zero=True)(words_inputs)
    else:
        emb = Embedding(emb_weights.shape[0],
                        emb_weights.shape[1],
                        mask_zero=True,
                        weights=[emb_weights],
                        trainable=False)(words_inputs)
    """
    if drop_wordemb != 0.0:
        emb = Dropout(drop_wordemb)(emb)
    """
    # RNN layer (GRU/LSTM/biLSTM)
    word_rnn = Bidirectional(recursive_class(word_rnnsize,
                                             return_sequences=True),
                             merge_mode='concat')(emb)
    # word_rnn = BatchNormalization()(word_rnn)

    if drop_wordrnnout > 0.0:
        word_rnn = Dropout(drop_wordrnnout)(word_rnn)

    sentence_att = AttentionLayer()(word_rnn)

    sentence_out = Dense(6, activation="softmax",
                         name="words_Out")(sentence_att)

    model = Model(words_inputs, sentence_out)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    """
    documentInputs = Input(shape=(None, maxSeq), dtype='int32', name='document_input')
    sentenceMasking = Masking(mask_value=0)(documentInputs)
    sentenceEmbbeding = TimeDistributed(modelSentence)(sentenceMasking)
    sentenceAttention = TimeDistributed(modelSentAttention)(sentenceMasking)
    sentenceRnn = Bidirectional(recursiveClass(wordRnnSize, return_sequences=True), merge_mode='concat')(
        sentenceEmbbeding)
    if dropSentenceRnnOut > 0.0:
        sentenceRnn = Dropout(dropSentenceRnnOut)(sentenceRnn)
    attentionSent = AttentionLayer()(sentenceRnn)

    documentEmb = merge([sentenceRnn, attentionSent], mode=lambda x: x[1] * x[0], output_shape=lambda x: x[0])
    documentEmb = Lambda(lambda x: K.sum(x, axis=1), output_shape=lambda x: (x[0], x[2]), name="att2")(documentEmb)
    documentOut = Dense(1, activation="sigmoid", name="documentOut")(documentEmb)

    model = Model(input=[documentInputs], output=[documentOut])
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    modelAttentionEv = Model(input=[documentInputs], output=[documentOut, sentenceAttention, attentionSent])
    modelAttentionEv.compile(loss='binary_crossentropy',
                             optimizer='rmsprop',
                             metrics=['accuracy'])
    """
    return model
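# A hedged usage sketch for hierarchical_attention; the vocabulary size,
# embedding dimension and sequence length below are made-up placeholders, not
# values taken from the project.
import numpy as np

fake_emb = np.random.rand(10000, 200).astype('float32')  # hypothetical embeddings
han_model = hierarchical_attention(max_seq=50, emb_weights=fake_emb)
han_model.summary()

# without pre-trained weights, the embedding dimensions are passed explicitly
han_model_rand = hierarchical_attention(max_seq=50,
                                        embedding_size=200,
                                        vocab_size=10000)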
Example #5
from sklearn import preprocessing


def data_normal(x):
    # rescale each feature column independently to the [0, 1] range
    min_max_scaler = preprocessing.MinMaxScaler()
    x = min_max_scaler.fit_transform(x)
    return x
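# Toy illustration of data_normal (the values are arbitrary): each column is
# rescaled independently to [0, 1].
import numpy as np

example = np.array([[1.0, 20.0],
                    [2.0, 40.0],
                    [3.0, 60.0]])
print(data_normal(example))
# -> [[0.   0. ]
#     [0.5  0.5]
#     [1.   1. ]]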


# Text Branch
text_input = Input(shape=(98, ))
em_text = Embedding(len(dic) + 1, 200, weights=[embed_matrix],
                    trainable=True)(text_input)
mask_text_input = Masking(mask_value=0.)(em_text)
text_l1 = Bidirectional(
    LSTM(100, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_text'))(mask_text_input)
text_l1 = BatchNormalization()(text_l1)
text_weight = AttentionLayer()(text_l1)
text_weight = BatchNormalization()(text_weight)  #
text_weight_exp = Lambda(weight_expand)(text_weight)  #
text_attention = Lambda(weight_dot)([text_l1, text_weight_exp])  #
text_att = Lambda(lambda x: backend.sum(x, axis=1))(text_attention)  #
dropout_text = Dropout(0.5)(text_att)

text_prediction = Dense(numclass, activation='softmax')(dropout_text)
text_model = Model(inputs=text_input, outputs=text_prediction)
inter_text_hidden = Model(inputs=text_input,
                          outputs=[text_attention, text_weight])
inter_text_weight = Model(inputs=text_input, outputs=text_weight)

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
text_model.compile(loss='categorical_crossentropy',
                   optimizer=adam,
                   metrics=['accuracy'])
Example #6
em_text = Embedding(len(dic) + 1, 200, weights=[embed_matrix],
                    trainable=True)(text_input)
# masking layer
text = Masking(mask_value=0., name='ph1_mask')(em_text)
# LSTM layer
text = LSTM(512,
            return_sequences=True,
            recurrent_dropout=0.25,
            name='ph1_LSTM_text_1')(text)
text = LSTM(256,
            return_sequences=True,
            recurrent_dropout=0.25,
            name='ph1_LSTM_text_2')(text)
# batch normalization
#text_l1 = BatchNormalization(name=)(text_l1)
# attention layer
text_weight = AttentionLayer(name='ph1_att')(text)
text_weight = Lambda(weight_expand, name='ph1_lam1')(text_weight)
text_vector = Lambda(weight_dot, name='ph1_lam2')([text, text_weight])
text_feature_vector = Lambda(lambda x: backend.sum(x, axis=1),
                             name='ph1_lam3')(text_vector)
# dropout layer
dropout_text = Dropout(0.25, name='ph1_drop1')(text_feature_vector)
dense_text_1 = Dense(128, activation='relu', name='ph1_dense')(dropout_text)
dropout_text = Dropout(0.25, name='ph1_drop2')(dense_text_1)
# decision-making
text_prediction = Dense(numclass, activation='softmax',
                        name='ph1_dec')(dropout_text)
text_model = Model(inputs=text_input,
                   outputs=text_prediction,
                   name='ph1_model')
#inter_text = Model(inputs = text_input, outputs = text_feature_vector)
Example #7
            r_3[str(result[i])] += 1
        elif test_label[i] == 4:
            r_4[str(result[i])] += 1
        i += 1
    return r_0, r_1, r_2, r_3, r_4


# Audio BLSTM

audio_input = Input(shape=(2250, 64))
mask_input = Masking(mask_value=0.)(audio_input)
audio_l1 = Bidirectional(
    LSTM(100, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_1'))(mask_input)
#audio_l2 = Bidirectional(LSTM(256, return_sequences=False, recurrent_dropout=0.5, name='LSTM_2'))(audio_l1)
audio_att = AttentionLayer()(audio_l1)
activation5 = Dropout(0.25)(audio_att)

final_prediction = Dense(5, activation='softmax')(activation5)
final_model = Model(inputs=audio_input, outputs=final_prediction)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
final_model.compile(loss='categorical_crossentropy',
                    optimizer=adam,
                    metrics=['accuracy'])

label = get_label(label_path)
data = get_mat_data(audio_path)
train_data, train_label, test_data, test_label_o = seperate_dataset(
    data, label)
test_label = to_categorical(test_label_o, num_classes=5)
train_label = to_categorical(train_label, num_classes=5)
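# The excerpt stops after the labels are one-hot encoded. A hedged sketch of
# how final_model would typically be trained on the prepared arrays; the batch
# size and epoch count are placeholders, not taken from the project.
final_model.fit(train_data, train_label,
                batch_size=32,
                epochs=20,
                validation_data=(test_data, test_label))
score = final_model.evaluate(test_data, test_label, verbose=0)
print('test loss: %.4f, test accuracy: %.4f' % (score[0], score[1]))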
Example #8
    return res

###### Audio branch 2

# calculate left audio feature vector
left_input = Input(shape=(602, 64))
left_audio = Masking(mask_value=0.)(left_input)
left_audio = LSTM(256,
                  return_sequences=True,
                  recurrent_dropout=0.25,
                  name='LSTM_left_audio_1')(left_audio)
left_audio = LSTM(128,
                  return_sequences=True,
                  recurrent_dropout=0.25,
                  name='LSTM_left_audio_2')(left_audio)
left_audio_weight = AttentionLayer()(left_audio)
left_audio_weight = Lambda(weight_expand)(left_audio_weight)
left_audio_vector = Lambda(weight_dot)([left_audio, left_audio_weight])
left_audio_feature_vector = Lambda(
    lambda x: backend.sum(x, axis=1))(left_audio_vector)

# calculate right audio feature vector
right_input = Input(shape=(602, 64))
right_audio = Masking(mask_value=0.)(right_input)
right_audio = LSTM(256,
                   return_sequences=True,
                   recurrent_dropout=0.25,
                   name='LSTM_right_audio_1')(right_audio)
right_audio = LSTM(128,
                   return_sequences=True,
                   recurrent_dropout=0.25,
                   name='LSTM_right_audio_2')(right_audio)
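# The excerpts never show how the two channel vectors are combined. By analogy
# with the left branch above and the two-input model in Example #2, a hedged
# sketch (the concatenation and layer sizes are assumptions, not project code):
from keras.layers import concatenate

right_audio_weight = AttentionLayer()(right_audio)
right_audio_weight = Lambda(weight_expand)(right_audio_weight)
right_audio_vector = Lambda(weight_dot)([right_audio, right_audio_weight])
right_audio_feature_vector = Lambda(
    lambda x: backend.sum(x, axis=1))(right_audio_vector)

merged_audio = concatenate([left_audio_feature_vector,
                            right_audio_feature_vector])
merged_audio = Dropout(0.5)(merged_audio)
audio_prediction = Dense(numclass, activation='softmax')(merged_audio)
audio_model = Model(inputs=[left_input, right_input], outputs=audio_prediction)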
Example #9
# LSTM layer
text = LSTM(512,
            return_sequences=True,
            recurrent_dropout=0.25,
            name='LSTM_text_1')(text)

text = LSTM(256,
            return_sequences=True,
            recurrent_dropout=0.25,
            name='LSTM_text_2')(text)

#text_l1 = BatchNormalization()(text_l1)

# attention layer
text_weight = AttentionLayer()(text)
text_weight = Lambda(weight_expand)(text_weight)
text_vector = Lambda(weight_dot)([text, text_weight])
text_feature_vector = Lambda(lambda x: backend.sum(x, axis=1))(text_vector)

# dropout layer
dropout_text = Dropout(0.25)(text_feature_vector)
dense_text_1 = Dense(128, activation='relu')(dropout_text)
dropout_text = Dropout(0.25)(dense_text_1)

# decision-making
text_prediction = Dense(numclass, activation='softmax')(dropout_text)
text_model = Model(inputs=text_input, outputs=text_prediction)

text_model.load_weights(saving_path + 'entire_text_output_weights.h5')
text_model._make_predict_function()
text = Masking(mask_value=0., name='ph1_mask')(em_text)
# LSTM layer
text = LSTM(512,
            return_sequences=True,
            recurrent_dropout=0.25,
            name='ph1_LSTM_text_1',
            trainable=phase_1_trainable)(text)
text = LSTM(256,
            return_sequences=True,
            recurrent_dropout=0.25,
            name='ph1_LSTM_text_2',
            trainable=phase_1_trainable)(text)
# batch normalization
#text_l1 = BatchNormalization(name=)(text_l1)
# attention layer
text_weight = AttentionLayer(name='ph1_att', trainable=phase_1_trainable)(text)
text_weight = Lambda(weight_expand,
                     name='ph1_lam1',
                     trainable=phase_1_trainable)(text_weight)
text_vector = Lambda(weight_dot, name='ph1_lam2',
                     trainable=phase_1_trainable)([text, text_weight])
text_feature_vector = Lambda(lambda x: backend.sum(x, axis=1),
                             name='ph1_lam3',
                             trainable=phase_1_trainable)(text_vector)
# dropout layer
dropout_text = Dropout(0.25, name='ph1_drop1',
                       trainable=phase_1_trainable)(text_feature_vector)
dense_text_1 = Dense(128,
                     activation='relu',
                     name='ph1_dense',
                     trainable=phase_1_trainable)(dropout_text)
Example #11
print('test_text shape:', test_text_data.shape)
print('train_label shape:', final_train_label.shape)
print('test_label shape:', test_label.shape)
"""

# Audio branch
frame_input = Input(shape=(513, 64))
mask_frame_input = Masking(mask_value=0.)(frame_input)
print('mask_frame_input shape: ', mask_frame_input.shape)
frame_l1 = Bidirectional(
    LSTM(128,
         return_sequences=True,
         recurrent_dropout=0.25,
         name='LSTM_audio_1'))(mask_frame_input)
print('frame_l1 shape: ', frame_l1.shape)
frame_att = AttentionLayer()(frame_l1)
print('frame_att shape: ', frame_att.shape)
dropout_frame = Dropout(0.5)(frame_att)
model_frame = Model(frame_input, dropout_frame)

word_input = Input(shape=(98, 513, 64))
mask_word_input = Masking(mask_value=0.)(word_input)
print('mask_word_input shape: ', mask_word_input.shape)
audio_input = TimeDistributed(model_frame)(mask_word_input)
print('audio_input shape: ', audio_input.shape)
audio_input = Masking(mask_value=0.)(audio_input)
audio_l1 = Bidirectional(
    LSTM(128,
         return_sequences=True,
         recurrent_dropout=0.25,
         name='LSTM_audio_2'))(audio_input)
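# The excerpt ends mid-branch. By analogy with the other examples on this page,
# the word-level audio branch would typically close with attention, dropout and
# a softmax classifier; this sketch is an assumption, not the original file.
audio_att = AttentionLayer()(audio_l1)
dropout_audio = Dropout(0.5)(audio_att)
audio_prediction = Dense(5, activation='softmax')(dropout_audio)
audio_model = Model(inputs=word_input, outputs=audio_prediction)
audio_model.compile(loss='categorical_crossentropy',
                    optimizer=Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
                                   epsilon=1e-08),
                    metrics=['accuracy'])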