def UttrAtten_AttenVec(atten):
    """Build an LSTM chunk encoder with utterance-level attention pooling.

    Each input sample is one chunk of ``time_step`` frames with ``feat_num``
    LLD features. Chunks arrive grouped per sentence (``chunk_num`` chunks
    per utterance), so the attention sub-network ``atten`` is applied to each
    utterance's slice of the batch separately.

    Args:
        atten: a Keras sub-model/layer that pools one utterance's
            ``chunk_num`` chunk encodings into a single vector.

    Returns:
        A Keras ``Model`` mapping chunk inputs to the output network's
        predictions, repeated to match the input batch size.
    """
    time_step = 62   # frames per chunk (i.e., m)
    feat_num = 130   # number of LLD features
    chunk_num = 11   # chunks per sentence (i.e., C)

    # Input + two-layer LSTM encoder (final state only), then BatchNorm.
    inputs = Input((time_step, feat_num))
    enc = LSTM(units=feat_num, activation='tanh', dropout=0.5,
               return_sequences=True)(inputs)
    enc = LSTM(units=feat_num, activation='tanh', dropout=0.5,
               return_sequences=False)(enc)
    enc = BatchNormalization()(enc)

    # Utterance attention: slice the batch into per-utterance groups of
    # chunk_num encodings and pool each group with `atten`.
    uttr_outputs = []
    for start in range(0, batch_size * chunk_num, chunk_num):
        chunk_enc = crop(0, start, start + chunk_num)(enc)
        chunk_enc = reshape()(chunk_enc)
        uttr_outputs.append(atten(chunk_enc))

    # Output layer: stack utterance vectors back along the batch axis,
    # predict, then repeat so the output batch matches the input batch.
    stacked = Concatenate(axis=0)(uttr_outputs)
    outputs = output_net(feat_num)(stacked)
    outputs = repeat()(outputs)  # for matching the input batch size
    return Model(inputs=inputs, outputs=outputs)
def UttrAtten_GatedVec(atten):
    """Build a 1-D CNN chunk encoder with gated utterance-level attention.

    Each input sample is one chunk of ``time_step`` frames with ``feat_num``
    LLD features. Chunks arrive grouped per sentence (``chunk_num`` chunks
    per utterance); ``atten`` produces gating weights that are multiplied
    element-wise with the chunk encodings before averaging per utterance.

    Args:
        atten: a Keras sub-model/layer yielding attention weights for one
            utterance's ``chunk_num`` chunk encodings.

    Returns:
        A Keras ``Model`` mapping chunk inputs to the output network's
        predictions, repeated to match the input batch size.
    """
    time_step = 62   # frames per chunk (i.e., m)
    feat_num = 130   # number of LLD features
    chunk_num = 11   # chunks per sentence (i.e., C)

    def conv_block(x, filters, strides):
        # Conv1D -> BatchNorm -> ReLU in channels-first layout.
        x = Conv1D(filters=filters, kernel_size=3, strides=strides,
                   dilation_rate=1, data_format='channels_first')(x)
        x = BatchNormalization()(x)
        return Activation('relu')(x)

    # Input layer; permute to channels-first for the CNN stack.
    inputs = Input((time_step, feat_num))
    enc = Permute((2, 1))(inputs)
    # cnn1: [128, 128]
    enc = conv_block(enc, 128, 1)
    enc = conv_block(enc, 128, 1)
    # cnn2: [64, 64]
    enc = conv_block(enc, 64, 1)
    enc = conv_block(enc, 64, 1)
    # cnn3: [32], strided
    enc = conv_block(enc, 32, 2)
    # Flatten the CNN output and project back to feat_num dimensions.
    enc = Flatten()(enc)
    enc = Dense(units=feat_num, activation='relu')(enc)

    # Gated utterance attention: for each utterance's slice of the batch,
    # gate the chunk encodings with the attention weights and average.
    uttr_outputs = []
    for start in range(0, batch_size * chunk_num, chunk_num):
        chunk_enc = crop(0, start, start + chunk_num)(enc)
        chunk_enc = reshape()(chunk_enc)
        gate = atten(chunk_enc)
        gated = Multiply()([chunk_enc, gate])
        uttr_outputs.append(mean()(gated))

    # Output layer: stack utterance vectors back along the batch axis,
    # predict, then repeat so the output batch matches the input batch.
    stacked = Concatenate(axis=0)(uttr_outputs)
    outputs = output_net(feat_num)(stacked)
    outputs = repeat()(outputs)  # for matching the input batch size
    return Model(inputs=inputs, outputs=outputs)