final_train_audio = np.array(final_train_audio)
print('train_audio shape:', final_train_audio.shape)
print('train_text shape:', final_train_text.shape)
print('test_audio shape:', test_audio_data.shape)
print('test_text shape:', test_text_data.shape)
print('train_label shape:', final_train_label.shape)
print('test_label shape:', test_label.shape)
"""

# Audio branch
audio_input = Input(shape=(2250, 64))
mask_audio_input = Masking(mask_value=0.)(audio_input)
audio_l1 = Bidirectional(
    LSTM(128, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_audio'))(mask_audio_input)
audio_att = AttentionLayer()(audio_l1)
dropout_audio = Dropout(0.5)(audio_att)
audio_prediction = Dense(5, activation='softmax')(dropout_audio)
audio_model = Model(inputs=audio_input, outputs=audio_prediction)
inter_audio_model = Model(inputs=audio_input, outputs=audio_att)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
audio_model.compile(loss='categorical_crossentropy',
                    optimizer=adam,
                    metrics=['accuracy'])

# Text branch
text_input = Input(shape=(50,))
em_text = Embedding(len(dic) + 1,
                    200,
                    weights=[embed_matrix],
                    trainable=True)(text_input)
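# NOTE: AttentionLayer is imported from elsewhere in the project and is not
# defined in this excerpt. The class below is a minimal sketch (an assumption,
# NOT the project's implementation) of a soft-attention layer that scores each
# BLSTM timestep and returns the attention-pooled vector, matching how the
# audio branch above feeds its output straight into Dropout/Dense. Other
# branches below instead treat the layer's output as per-timestep weights and
# pool manually via weight_expand / weight_dot.
from keras import backend
from keras.layers import Layer


class AttentionLayerSketch(Layer):
    """Hypothetical stand-in for the project's AttentionLayer."""

    def build(self, input_shape):
        # One scoring weight per hidden feature: (features, 1).
        self.att_w = self.add_weight(name='att_w',
                                     shape=(int(input_shape[-1]), 1),
                                     initializer='glorot_uniform',
                                     trainable=True)
        super(AttentionLayerSketch, self).build(input_shape)

    def call(self, x, mask=None):
        # x: (batch, timesteps, features) -> scores: (batch, timesteps)
        scores = backend.squeeze(backend.dot(x, self.att_w), axis=-1)
        alpha = backend.softmax(scores)
        # Attention-weighted sum over time -> (batch, features).
        return backend.sum(x * backend.expand_dims(alpha, axis=-1), axis=1)

    def compute_output_shape(self, input_shape):
        return (input_shape[0], input_shape[2])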
# LSTM layer
audio = LSTM(512, return_sequences=True, recurrent_dropout=0.25,
             name='LSTM_audio_1')(audio)
audio = LSTM(256, return_sequences=True, recurrent_dropout=0.25,
             name='LSTM_audio_2')(audio)
#frame_l1 = BatchNormalization()(frame_l1)

# attention layer
audio_weight = AttentionLayer()(audio)
audio_weight = Lambda(weight_expand)(audio_weight)
audio_vector = Lambda(weight_dot)([audio, audio_weight])
audio_feature_vector = Lambda(lambda x: backend.sum(x, axis=1))(audio_vector)

# dropout layer
dropout_audio = Dropout(0.5)(audio_feature_vector)
dense_audio_1 = Dense(128, activation='relu')(dropout_audio)
dropout_audio = Dropout(0.5)(dense_audio_1)

# decision-making
audio_prediction = Dense(numclass, activation='softmax')(dropout_audio)
audio_model = Model(inputs=[left_input, right_input],
                    outputs=audio_prediction)
inter_audio = Model(inputs=[left_input, right_input],
                    outputs=audio_feature_vector)
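# weight_expand is used with Lambda above but is defined elsewhere in the
# project. A plausible minimal definition (an assumption, not the original
# code): expand the per-timestep attention weights from (batch, timesteps)
# to (batch, timesteps, 1) so weight_dot can broadcast them across features.
def weight_expand_sketch(x):
    return backend.expand_dims(x, axis=-1)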
def weight_dot(inputs):
    x = inputs[0]
    y = inputs[1]
    return x * y


# Contextual branch
context_input = Input(shape=(537, 64))
# Keep the Input tensor intact; the original reassigned context_input to the
# Masking output, which breaks Model(inputs=context_input, ...) later.
context_masked = Masking(mask_value=0.)(context_input)
context_l1 = Bidirectional(
    LSTM(256, return_sequences=True, recurrent_dropout=0.25,
         name='contextual_LSTM'))(context_masked)
context_weight = AttentionLayer()(context_l1)
context_weight_exp = Lambda(weight_expand)(context_weight)
context_attention = Lambda(weight_dot)([context_l1, context_weight_exp])
context_att = Lambda(lambda x: backend.sum(x, axis=1))(context_attention)
dropout_context = Dropout(0.25)(context_att)

# Original branch
ori_input = Input(shape=(537, 64))
ori_masked = Masking(mask_value=0.)(ori_input)
ori_l1 = Bidirectional(
    LSTM(256, return_sequences=True, recurrent_dropout=0.25,
         name='original_LSTM'))(ori_masked)  # renamed: duplicate layer names
                                             # ('contextual_LSTM') raise an
                                             # error when branches are fused
ori_weight = AttentionLayer()(ori_l1)
ori_weight_exp = Lambda(weight_expand)(ori_weight)
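# The excerpt cuts off here; by symmetry with the contextual branch, the
# original branch presumably finishes with the same weighted-sum pooling
# (a sketch, not the original code):
ori_attention = Lambda(weight_dot)([ori_l1, ori_weight_exp])
ori_att = Lambda(lambda x: backend.sum(x, axis=1))(ori_attention)
dropout_ori = Dropout(0.25)(ori_att)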
def hierarchical_attention(
        max_seq,
        emb_weights=None,
        embedding_size=None,
        vocab_size=None,  # embedding
        recursive_class=GRU,
        word_rnnsize=100,  # rnn
        drop_wordemb=0.2,
        drop_wordrnnout=0.2):
    """
    Creates a model based on the Hierarchical Attention model, following:
    https://arxiv.org/abs/1606.02393

    inputs:
        max_seq : max sentence length for the word sequence
        emb_weights : numpy matrix with embedding values
        embedding_size (if emb_weights is None) : embedding size
        vocab_size (if emb_weights is None) : vocabulary size
    recursive layers:
        recursive_class : class used for the recurrent layers. Default is GRU
        word_rnnsize : RNN size for the word sequence
    dropout:
        drop_wordemb : dropout rate after the embedding layer
        drop_wordrnnout : dropout rate after the word-level RNN

    returns:
        The compiled sentence-level model. (A document-level variant with
        extra attention outputs that can be used to analyse attention is
        kept below, commented out.)
    """
    # Sentence-level logic

    # Input layer
    words_inputs = Input(shape=(max_seq, ), dtype='int32', name='words_input')

    # Word embedding layer
    if emb_weights is None:
        emb = Embedding(vocab_size, embedding_size,
                        mask_zero=True)(words_inputs)
    else:
        emb = Embedding(emb_weights.shape[0],
                        emb_weights.shape[1],
                        mask_zero=True,
                        weights=[emb_weights],
                        trainable=False)(words_inputs)
    """
    if drop_wordemb != 0.0:
        emb = Dropout(drop_wordemb)(emb)
    """

    # RNN layer (GRU/LSTM/biLSTM)
    word_rnn = Bidirectional(recursive_class(word_rnnsize,
                                             return_sequences=True),
                             merge_mode='concat')(emb)
    # word_rnn = BatchNormalization()(word_rnn)
    if drop_wordrnnout > 0.0:
        word_rnn = Dropout(drop_wordrnnout)(word_rnn)

    sentence_att = AttentionLayer()(word_rnn)
    sentence_out = Dense(6, activation="softmax",
                         name="words_Out")(sentence_att)

    model = Model(words_inputs, sentence_out)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    """
    documentInputs = Input(shape=(None, maxSeq), dtype='int32',
                           name='document_input')
    sentenceMasking = Masking(mask_value=0)(documentInputs)
    sentenceEmbbeding = TimeDistributed(modelSentence)(sentenceMasking)
    sentenceAttention = TimeDistributed(modelSentAttention)(sentenceMasking)
    sentenceRnn = Bidirectional(recursiveClass(wordRnnSize,
                                               return_sequences=True),
                                merge_mode='concat')(sentenceEmbbeding)
    if dropSentenceRnnOut > 0.0:
        sentenceRnn = Dropout(dropSentenceRnnOut)(sentenceRnn)
    attentionSent = AttentionLayer()(sentenceRnn)

    documentEmb = merge([sentenceRnn, attentionSent],
                        mode=lambda x: x[1] * x[0],
                        output_shape=lambda x: x[0])
    documentEmb = Lambda(lambda x: K.sum(x, axis=1),
                         output_shape=lambda x: (x[0], x[2]),
                         name="att2")(documentEmb)
    documentOut = Dense(1, activation="sigmoid",
                        name="documentOut")(documentEmb)

    model = Model(input=[documentInputs], output=[documentOut])
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    modelAttentionEv = Model(input=[documentInputs],
                             output=[documentOut, sentenceAttention,
                                     attentionSent])
    modelAttentionEv.compile(loss='binary_crossentropy',
                             optimizer='rmsprop',
                             metrics=['accuracy'])
    """
    return model
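# Usage sketch (illustrative, not from the original file): build the
# sentence-level model with the pretrained embedding matrix used elsewhere in
# this project. `train_text`, `train_label`, `test_text` and `test_label` are
# hypothetical placeholders for the project's prepared arrays.
han_model = hierarchical_attention(max_seq=98, emb_weights=embed_matrix)
han_model.fit(train_text,
              train_label,
              batch_size=32,
              epochs=10,
              validation_data=(test_text, test_label))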
def data_normal(x):
    min_max_scaler = preprocessing.MinMaxScaler()
    x = min_max_scaler.fit_transform(x)
    return x


# Text branch
text_input = Input(shape=(98,))
em_text = Embedding(len(dic) + 1,
                    200,
                    weights=[embed_matrix],
                    trainable=True)(text_input)
mask_text_input = Masking(mask_value=0.)(em_text)
text_l1 = Bidirectional(
    LSTM(100, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_text'))(mask_text_input)
text_l1 = BatchNormalization()(text_l1)
text_weight = AttentionLayer()(text_l1)
text_weight = BatchNormalization()(text_weight)
# The weighted-sum pooling below was commented out in the original, which
# left text_attention and dropout_text undefined for the layers that follow;
# it is restored here.
text_weight_exp = Lambda(weight_expand)(text_weight)
text_attention = Lambda(weight_dot)([text_l1, text_weight_exp])
text_att = Lambda(lambda x: backend.sum(x, axis=1))(text_attention)
dropout_text = Dropout(0.5)(text_att)
text_prediction = Dense(numclass, activation='softmax')(dropout_text)
text_model = Model(inputs=text_input, outputs=text_prediction)
inter_text_hidden = Model(inputs=text_input,
                          outputs=[text_attention, text_weight])
inter_text_weight = Model(inputs=text_input, outputs=text_weight)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
text_model.compile(loss='categorical_crossentropy',
                   optimizer=adam,
                   metrics=['accuracy'])
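# Usage sketch (assumption, not from the original file): the inter_* models
# expose intermediate tensors so the learned attention can be inspected after
# training, e.g.:
attention_weights = inter_text_weight.predict(test_text_data)
print('per-token attention weights:', attention_weights.shape)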
em_text = Embedding(len(dic) + 1,
                    200,
                    weights=[embed_matrix],
                    trainable=True)(text_input)

# masking layer
text = Masking(mask_value=0., name='ph1_mask')(em_text)

# LSTM layer
text = LSTM(512, return_sequences=True, recurrent_dropout=0.25,
            name='ph1_LSTM_text_1')(text)
text = LSTM(256, return_sequences=True, recurrent_dropout=0.25,
            name='ph1_LSTM_text_2')(text)

# batch normalization
#text_l1 = BatchNormalization(name=)(text_l1)

# attention layer
text_weight = AttentionLayer(name='ph1_att')(text)
text_weight = Lambda(weight_expand, name='ph1_lam1')(text_weight)
text_vector = Lambda(weight_dot, name='ph1_lam2')([text, text_weight])
text_feature_vector = Lambda(lambda x: backend.sum(x, axis=1),
                             name='ph1_lam3')(text_vector)

# dropout layer
dropout_text = Dropout(0.25, name='ph1_drop1')(text_feature_vector)
dense_text_1 = Dense(128, activation='relu', name='ph1_dense')(dropout_text)
dropout_text = Dropout(0.25, name='ph1_drop2')(dense_text_1)

# decision-making
text_prediction = Dense(numclass, activation='softmax',
                        name='ph1_dec')(dropout_text)
text_model = Model(inputs=text_input, outputs=text_prediction,
                   name='ph1_model')
#inter_text = Model(inputs=text_input, outputs=text_feature_vector)
            r_3[str(result[i])] += 1
        elif test_label[i] == 4:
            r_4[str(result[i])] += 1
        i += 1
    return r_0, r_1, r_2, r_3, r_4


# Audio BLSTM
audio_input = Input(shape=(2250, 64))
mask_input = Masking(mask_value=0.)(audio_input)
audio_l1 = Bidirectional(
    LSTM(100, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_1'))(mask_input)
#audio_l2 = Bidirectional(LSTM(256, return_sequences=False, recurrent_dropout=0.5, name='LSTM_2'))(audio_l1)
audio_att = AttentionLayer()(audio_l1)
activation5 = Dropout(0.25)(audio_att)
final_prediction = Dense(5, activation='softmax')(activation5)
final_model = Model(inputs=audio_input, outputs=final_prediction)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
final_model.compile(loss='categorical_crossentropy',
                    optimizer=adam,
                    metrics=['accuracy'])

label = get_label(label_path)
data = get_mat_data(audio_path)
train_data, train_label, test_data, test_label_o = seperate_dataset(
    data, label)
test_label = to_categorical(test_label_o, num_classes=5)
train_label = to_categorical(train_label, num_classes=5)
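# Training sketch (assumed hyper-parameters; the original fit call is not in
# this excerpt):
final_model.fit(train_data,
                train_label,
                batch_size=32,
                epochs=25,
                validation_data=(test_data, test_label))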
    return res


###### Audio branch 2
# calculate left audio feature vector
left_input = Input(shape=(602, 64))
left_audio = Masking(mask_value=0.)(left_input)
left_audio = LSTM(256, return_sequences=True, recurrent_dropout=0.25,
                  name='LSTM_left_audio_1')(left_audio)
left_audio = LSTM(128, return_sequences=True, recurrent_dropout=0.25,
                  name='LSTM_left_audio_2')(left_audio)
left_audio_weight = AttentionLayer()(left_audio)
left_audio_weight = Lambda(weight_expand)(left_audio_weight)
left_audio_vector = Lambda(weight_dot)([left_audio, left_audio_weight])
left_audio_feature_vector = Lambda(
    lambda x: backend.sum(x, axis=1))(left_audio_vector)

# calculate right audio feature vector
right_input = Input(shape=(602, 64))
right_audio = Masking(mask_value=0.)(right_input)
right_audio = LSTM(256, return_sequences=True, recurrent_dropout=0.25,
                   name='LSTM_right_audio_1')(right_audio)
right_audio = LSTM(128, return_sequences=True, recurrent_dropout=0.25,
                   name='LSTM_right_audio_2')(right_audio)
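# The excerpt cuts off here; by symmetry with the left branch, the right
# branch presumably finishes with the same attention pooling (a sketch, not
# the original code):
right_audio_weight = AttentionLayer()(right_audio)
right_audio_weight = Lambda(weight_expand)(right_audio_weight)
right_audio_vector = Lambda(weight_dot)([right_audio, right_audio_weight])
right_audio_feature_vector = Lambda(
    lambda x: backend.sum(x, axis=1))(right_audio_vector)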
# LSTM layer
text = LSTM(512, return_sequences=True, recurrent_dropout=0.25,
            name='LSTM_text_1')(text)
text = LSTM(256, return_sequences=True, recurrent_dropout=0.25,
            name='LSTM_text_2')(text)
#text_l1 = BatchNormalization()(text_l1)

# attention layer
text_weight = AttentionLayer()(text)
text_weight = Lambda(weight_expand)(text_weight)
text_vector = Lambda(weight_dot)([text, text_weight])
text_feature_vector = Lambda(lambda x: backend.sum(x, axis=1))(text_vector)

# dropout layer
dropout_text = Dropout(0.25)(text_feature_vector)
dense_text_1 = Dense(128, activation='relu')(dropout_text)
dropout_text = Dropout(0.25)(dense_text_1)

# decision-making
text_prediction = Dense(numclass, activation='softmax')(dropout_text)
text_model = Model(inputs=text_input, outputs=text_prediction)
text_model.load_weights(saving_path + 'entire_text_output_weights.h5')
text_model._make_predict_function()
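# Inference sketch (assumption, not from the original file): with the saved
# weights loaded and the predict function pre-built (a Keras idiom for
# thread-safe prediction), the frozen text model can score padded sequences
# directly:
probabilities = text_model.predict(test_text_data)  # (n_samples, numclass)
predicted_classes = probabilities.argmax(axis=-1)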
text = Masking(mask_value=0., name='ph1_mask')(em_text)

# LSTM layer
text = LSTM(512, return_sequences=True, recurrent_dropout=0.25,
            name='ph1_LSTM_text_1', trainable=phase_1_trainable)(text)
text = LSTM(256, return_sequences=True, recurrent_dropout=0.25,
            name='ph1_LSTM_text_2', trainable=phase_1_trainable)(text)

# batch normalization
#text_l1 = BatchNormalization(name=)(text_l1)

# attention layer
text_weight = AttentionLayer(name='ph1_att',
                             trainable=phase_1_trainable)(text)
text_weight = Lambda(weight_expand, name='ph1_lam1',
                     trainable=phase_1_trainable)(text_weight)
text_vector = Lambda(weight_dot, name='ph1_lam2',
                     trainable=phase_1_trainable)([text, text_weight])
text_feature_vector = Lambda(lambda x: backend.sum(x, axis=1),
                             name='ph1_lam3',
                             trainable=phase_1_trainable)(text_vector)

# dropout layer
dropout_text = Dropout(0.25, name='ph1_drop1',
                       trainable=phase_1_trainable)(text_feature_vector)
dense_text_1 = Dense(128, activation='relu', name='ph1_dense',
                     trainable=phase_1_trainable)(dropout_text)
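# NOTE (assumption): phase_1_trainable appears to implement two-phase
# training. It would be set to True while pretraining the text branch alone,
# and to False (e.g. `phase_1_trainable = False` before rebuilding the graph)
# to freeze all 'ph1_*' layers when the multimodal model is fine-tuned in
# phase 2.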
print('test_text shape:', test_text_data.shape)
print('train_label shape:', final_train_label.shape)
print('test_label shape:', test_label.shape)
"""

# Audio branch
frame_input = Input(shape=(513, 64))
mask_frame_input = Masking(mask_value=0.)(frame_input)
print('mask_frame_input shape: ', mask_frame_input.shape)
frame_l1 = Bidirectional(
    LSTM(128, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_audio_1'))(mask_frame_input)
print('frame_l1 shape: ', frame_l1.shape)
frame_att = AttentionLayer()(frame_l1)
print('frame_att shape: ', frame_att.shape)
dropout_frame = Dropout(0.5)(frame_att)
model_frame = Model(frame_input, dropout_frame)

word_input = Input(shape=(98, 513, 64))
mask_word_input = Masking(mask_value=0.)(word_input)
print('mask_word_input shape: ', mask_word_input.shape)
audio_input = TimeDistributed(model_frame)(mask_word_input)
print('audio_input shape: ', audio_input.shape)
audio_input = Masking(mask_value=0.)(audio_input)
audio_l1 = Bidirectional(
    LSTM(128, return_sequences=True, recurrent_dropout=0.25,
         name='LSTM_audio_2'))(audio_input)
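# The excerpt ends here; the utterance-level head presumably mirrors the
# other branches (a sketch, not the original code):
audio_att = AttentionLayer()(audio_l1)
dropout_audio = Dropout(0.5)(audio_att)
audio_prediction = Dense(5, activation='softmax')(dropout_audio)
audio_model = Model(inputs=word_input, outputs=audio_prediction)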