def build(self, inp, inp_mask):
    """Build the free-running (inference) synthesis graph.

    The decoder is unrolled with ``tf.while_loop`` for
    ``MAX_OUT_STEPS / self.r`` steps.  Every step emits ``self.r`` mel
    frames and feeds the last of them back in as the next decoder input;
    the first step is fed an all-zero frame.

    Args:
        inp: Input symbol tensor.  Dimension 0 is read as the batch size and
            dimension 1 as the number of input time steps.
        inp_mask: Forwarded as ``sequence_length`` to the encoder CBHG and
            to the text attention module (presumably per-utterance input
            lengths -- confirm against the caller).

    Attributes set on ``self``:
        single_style_token: Variable ``style_token`` with shape
            ``(1, styles_kind, style_dim)``.
        style_token: ``single_style_token`` tiled along the batch axis.
        out_mel: Decoder output reshaped to
            ``(batch, MAX_OUT_STEPS, OUTPUT_MEL_DIM)``.
        out_stftm: Post-net output projected to ``OUTPUT_SPEC_DIM``.
        final_alpha: Text attention weights reshaped to
            ``(batch, reduced_steps, input_time_steps)``.
        final_alpha_style: Style attention weights reshaped to
            ``(batch, reduced_steps, styles_kind)``.
        final_weight_ta: Per-step style gate reshaped to
            ``(batch, reduced_steps, 1)``.

    NOTE(review): the ``variable_scope`` nesting below was reconstructed
    from a whitespace-collapsed source.  Variable names depend on it, so
    confirm the hierarchy against existing checkpoints before relying on it.
    """
    batch_size = tf.shape(inp)[0]
    input_time_steps = tf.shape(inp)[1]

    ### Encoder
    with tf.variable_scope("encoder"):
        with tf.variable_scope("embedding"):
            embed_inp = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)(inp)

        with tf.variable_scope("changeToVarible"):
            # One learned bank of style tokens, shared by every utterance
            # and tiled so the style attention sees a batch-sized memory.
            self.single_style_token = tf.get_variable(
                'style_token', (1, styles_kind, style_dim), dtype=tf.float32)
            self.style_token = tf.tile(self.single_style_token,
                                       (batch_size, 1, 1))

        with tf.variable_scope("pre-net"):
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(embed_inp, 256, tf.nn.relu),
                training=self.training)
            pre_ed_inp = tf.layers.dropout(
                tf.layers.dense(pre_ed_inp, 128, tf.nn.relu),
                training=self.training)

        with tf.variable_scope("CBHG"):
            # batch major
            encoder_output = CBHG(16, (128, 128))(
                pre_ed_inp,
                sequence_length=inp_mask,
                is_training=self.training,
                time_major=False)

    with tf.variable_scope("attention"):
        att_module = AttentionModule(ATT_RNN_SIZE,
                                     encoder_output,
                                     sequence_length=inp_mask,
                                     time_major=False)

    with tf.variable_scope("attention_style"):
        att_module_style = AttentionModule(STYLE_ATT_RNN_SIZE,
                                           self.style_token,
                                           time_major=False)

    with tf.variable_scope("decoder"):
        with tf.variable_scope("attentionRnn"):
            att_cell = GRUCell(ATT_RNN_SIZE)

        with tf.variable_scope("acoustic_module"):
            aco_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])

        ### prepare output / alpha TensorArrays
        # Each decoder step produces self.r frames, so the loop runs for
        # MAX_OUT_STEPS / self.r iterations.
        reduced_time_steps = tf.div(MAX_OUT_STEPS, self.r)
        att_cell_state = att_cell.init_state(batch_size, tf.float32)
        aco_cell_state = aco_cell.zero_state(batch_size, tf.float32)
        state_tup = tuple([att_cell_state, aco_cell_state])
        output_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        alpha_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        weight_ta = tf.TensorArray(size=reduced_time_steps, dtype=tf.float32)
        alpha_style_ta = tf.TensorArray(size=reduced_time_steps,
                                        dtype=tf.float32)
        # All-zero frame fed to the first decoder step.
        init_indic = tf.zeros([batch_size, OUTPUT_MEL_DIM])
        time = tf.constant(0, dtype=tf.int32)
        cond = lambda time, *_: tf.less(time, reduced_time_steps)

        def body(time, indic, output_ta, alpha_ta, alpha_style_ta, weight_ta,
                 state_tup):
            """Run one decoder step and record its outputs."""
            with tf.variable_scope("att-rnn"):
                pre_ed_indic = tf.layers.dropout(
                    tf.layers.dense(indic, 256, tf.nn.relu),
                    training=self.training)
                pre_ed_indic = tf.layers.dropout(
                    tf.layers.dense(pre_ed_indic, 128, tf.nn.relu),
                    training=self.training)
                att_cell_out, att_cell_state = att_cell(
                    tf.concat([pre_ed_indic], axis=-1), state_tup[0])

            with tf.variable_scope("attention"):
                # The attention-RNN state (not its output) is the query;
                # presumably the state is a tuple -- confirm in GRUCell.
                query = att_cell_state[0]
                context, alpha = att_module(query)
                alpha_ta = alpha_ta.write(time, alpha)

            with tf.variable_scope("attention_style"):
                context_style, alpha_style = att_module_style(query)
                alpha_style_ta = alpha_style_ta.write(time, alpha_style)

            with tf.variable_scope("weighting"):
                # Sigmoid gate with a single output unit per batch row.
                weighting = add_layer(query, query.shape[-1], 1,
                                      'weighting_w', 'weighting_b',
                                      activation_function=tf.nn.sigmoid)
                weight_ta = weight_ta.write(time, weighting)

            with tf.variable_scope("acoustic_module"):
                # Same style mixing as the teacher-forced training graph:
                # text context plus the gated, tanh-squashed style context.
                weighting_context = context + weighting * tf.nn.tanh(
                    context_style)
                aco_input = tf.layers.dense(
                    tf.concat([att_cell_out, weighting_context], axis=-1),
                    DEC_RNN_SIZE)
                aco_cell_out, aco_cell_state = aco_cell(aco_input,
                                                        state_tup[1])
                dense_out = tf.reshape(
                    tf.layers.dense(aco_cell_out, OUTPUT_MEL_DIM * self.r),
                    shape=(batch_size, self.r, OUTPUT_MEL_DIM))
                output_ta = output_ta.write(time, dense_out)
                # Feed the last predicted frame of this group to the
                # next step.
                new_indic = dense_out[:, -1]

            state_tup = tuple([att_cell_state, aco_cell_state])
            return (tf.add(time, 1), new_indic, output_ta, alpha_ta,
                    alpha_style_ta, weight_ta, state_tup)

        ### run loop
        (_, _, output_mel_ta, final_alpha_ta, final_alpha_style_ta,
         final_weight_ta, *_) = tf.while_loop(cond, body, [
             time, init_indic, output_ta, alpha_ta, alpha_style_ta,
             weight_ta, state_tup
         ])

    ### stacked TensorArrays are time major; convert to batch major
    with tf.variable_scope("output"):
        output_mel = tf.reshape(
            output_mel_ta.stack(),
            shape=(reduced_time_steps, batch_size, OUTPUT_MEL_DIM * self.r))
        output_mel = tf.reshape(
            tf.transpose(output_mel, perm=(1, 0, 2)),
            shape=(batch_size, MAX_OUT_STEPS, OUTPUT_MEL_DIM))
        self.out_mel = output_mel

        with tf.variable_scope("post-net"):
            output_post = CBHG(8, (256, OUTPUT_MEL_DIM))(
                output_mel,
                sequence_length=None,
                is_training=self.training,
                time_major=False)
            output_spec = tf.layers.dense(output_post, OUTPUT_SPEC_DIM)
            self.out_stftm = output_spec

        final_alpha = tf.reshape(
            final_alpha_ta.stack(),
            shape=(reduced_time_steps, batch_size, input_time_steps))
        self.final_alpha = tf.transpose(final_alpha,
                                        perm=(1, 0, 2))  # batch major

        final_alpha_style = tf.reshape(
            final_alpha_style_ta.stack(),
            shape=(reduced_time_steps, batch_size, styles_kind))
        self.final_alpha_style = tf.transpose(final_alpha_style,
                                              perm=(1, 0, 2))  # batch major

        final_weight_ta = tf.reshape(
            final_weight_ta.stack(),
            shape=(reduced_time_steps, batch_size, 1))
        self.final_weight_ta = tf.transpose(final_weight_ta,
                                            perm=(1, 0, 2))  # batch major
def build(self, inp, inp_mask, mel_gtruth, spec_gtruth):
    """Build the teacher-forced training graph, its losses and summaries.

    The decoder runs ``output_time_steps / self.r`` steps inside
    ``tf.while_loop``.  At every step it is fed a ground-truth mel frame
    read from a TensorArray that holds ``self.r`` zero frames followed by
    the time-major ground truth, and it emits ``self.r`` frames.

    Args:
        inp: Input symbol tensor.  Dimension 0 is read as the batch size and
            dimension 1 as the number of input time steps.
        inp_mask: Forwarded as ``sequence_length`` to the encoder CBHG and
            to the text attention module (presumably per-utterance input
            lengths -- confirm against the caller).
        mel_gtruth: Ground-truth mel frames.  Dimension 1 is read as the
            number of output time steps and the tensor is transposed with
            ``perm=(1, 0, 2)``, so it is rank 3 with batch first.
        spec_gtruth: Ground-truth target compared against the post-net
            output in the L1 spectrogram loss.

    Attributes set on ``self``:
        single_style_token, style_token: Style-token variable of shape
            ``(1, styles_kind, style_dim)`` and its batch-tiled copy.
        out_mel, out_stftm: Decoder mel output and post-net output.
        weighting: Per-step style gate, batch major.
        loss_mel, loss_spec, loss: Mean absolute errors and their sum.
        alpha_img, alpha_style_img, weight_img: Attention weights / gate
            with a trailing channel axis, for image summaries.
        sums: List of summary ops.
        pred_audio_holder, pred_audio_summary: Placeholder and audio summary
            for externally supplied predicted audio.

    NOTE(review): the ``variable_scope`` nesting below was reconstructed
    from a whitespace-collapsed source.  Variable and summary names depend
    on it, so confirm the hierarchy against existing checkpoints.
    NOTE(review): the final reshape to ``output_time_steps`` presumably
    requires the output length to be a multiple of ``self.r`` -- confirm
    that the data pipeline pads accordingly.
    """
    n_batch = tf.shape(inp)[0]
    n_in_steps = tf.shape(inp)[1]
    n_out_steps = tf.shape(mel_gtruth)[1]

    # ---- encoder -------------------------------------------------------
    with tf.variable_scope("encoder"):
        with tf.variable_scope("embedding"):
            embedder = EmbeddingLayer(EMBED_CLASS, EMBED_DIM)
            embedded = embedder(inp)

        with tf.variable_scope("changeToVarible"):
            # A single learned bank of style tokens, tiled per batch row.
            self.single_style_token = tf.get_variable(
                'style_token', (1, styles_kind, style_dim), dtype=tf.float32)
            self.style_token = tf.tile(self.single_style_token,
                                       (n_batch, 1, 1))

        with tf.variable_scope("pre-net"):
            hidden = tf.layers.dense(embedded, 256, tf.nn.relu)
            hidden = tf.layers.dropout(hidden, training=self.training)
            hidden = tf.layers.dense(hidden, 128, tf.nn.relu)
            prenet_out = tf.layers.dropout(hidden, training=self.training)

        with tf.variable_scope("CBHG"):
            # batch major
            encoder_cbhg = CBHG(16, (128, 128))
            memory = encoder_cbhg(prenet_out,
                                  sequence_length=inp_mask,
                                  is_training=self.training,
                                  time_major=False)

    # ---- attention over text and over style tokens ---------------------
    with tf.variable_scope("attention"):
        text_attention = AttentionModule(ATT_RNN_SIZE,
                                         memory,
                                         sequence_length=inp_mask,
                                         time_major=False)

    with tf.variable_scope("attention_style"):
        style_attention = AttentionModule(STYLE_ATT_RNN_SIZE,
                                          self.style_token,
                                          time_major=False)

    # ---- decoder -------------------------------------------------------
    with tf.variable_scope("decoder"):
        with tf.variable_scope("attentionRnn"):
            attention_cell = GRUCell(ATT_RNN_SIZE)

        with tf.variable_scope("acoustic_module"):
            acoustic_cell = MultiRNNCell(
                [ResidualWrapper(GRUCell(DEC_RNN_SIZE)) for _ in range(2)])

        # One loop iteration per group of self.r output frames.
        decoder_steps = tf.div(n_out_steps, self.r)
        attention_state0 = attention_cell.init_state(n_batch, tf.float32)
        acoustic_state0 = acoustic_cell.zero_state(n_batch, tf.float32)
        states0 = (attention_state0, acoustic_state0)

        mel_ta0 = tf.TensorArray(size=decoder_steps, dtype=tf.float32)
        align_ta0 = tf.TensorArray(size=decoder_steps, dtype=tf.float32)
        gate_ta0 = tf.TensorArray(size=decoder_steps, dtype=tf.float32)
        style_align_ta0 = tf.TensorArray(size=decoder_steps,
                                         dtype=tf.float32)

        # Teacher-forcing inputs: self.r zero frames, then the time-major
        # ground truth, so that step 0 reads a zero frame.
        teacher_ta = tf.TensorArray(size=self.r + n_out_steps,
                                    dtype=tf.float32)
        mel_time_major = tf.transpose(mel_gtruth, perm=(1, 0, 2))
        leading_zeros = tf.zeros([self.r, n_batch, OUTPUT_MEL_DIM])
        padded_frames = tf.concat([leading_zeros, mel_time_major], axis=0)
        teacher_ta = teacher_ta.unstack(padded_frames)

        step0 = tf.constant(0, dtype=tf.int32)

        def keep_going(step, *_):
            """Loop while fewer than ``decoder_steps`` steps have run."""
            return tf.less(step, decoder_steps)

        def decode_step(step, mel_ta, align_ta, style_align_ta, gate_ta,
                        states):
            """Run one teacher-forced decoder step and record its outputs."""
            with tf.variable_scope("att-rnn"):
                # Index r*step + r - 1 is the last frame before this group
                # in the zero-padded ground truth.
                prev_frame = teacher_ta.read(self.r * step + self.r - 1)
                hidden = tf.layers.dense(prev_frame, 256, tf.nn.relu)
                hidden = tf.layers.dropout(hidden, training=self.training)
                hidden = tf.layers.dense(hidden, 128, tf.nn.relu)
                hidden = tf.layers.dropout(hidden, training=self.training)
                rnn_input = tf.concat([hidden], axis=-1)
                rnn_out, rnn_state = attention_cell(rnn_input, states[0])

            with tf.variable_scope("attention"):
                # The attention-RNN state (not its output) is the query;
                # presumably the state is a tuple -- confirm in GRUCell.
                query = rnn_state[0]
                context, alpha = text_attention(query)
                align_ta = align_ta.write(step, alpha)

            with tf.variable_scope("attention_style"):
                style_context, style_alpha = style_attention(query)
                style_align_ta = style_align_ta.write(step, style_alpha)

            with tf.variable_scope("weighting"):
                # NOTE(review): Python-level print of the symbolic tensor;
                # it executes while the graph is being built. Looks like
                # debugging residue -- confirm before removing.
                print(query)
                gate = add_layer(query, query.shape[-1], 1,
                                 'weighting_w', 'weighting_b',
                                 activation_function=tf.nn.sigmoid)
                gate_ta = gate_ta.write(step, gate)

            with tf.variable_scope("acoustic_module"):
                # Text context plus the gated, tanh-squashed style context.
                squashed_style = tf.nn.tanh(style_context)
                mixed_context = context + gate * squashed_style
                joined = tf.concat([rnn_out, mixed_context], axis=-1)
                acoustic_in = tf.layers.dense(joined, DEC_RNN_SIZE)
                acoustic_out, acoustic_state = acoustic_cell(acoustic_in,
                                                             states[1])
                frames = tf.layers.dense(acoustic_out,
                                         OUTPUT_MEL_DIM * self.r)
                mel_ta = mel_ta.write(step, frames)

            next_states = (rnn_state, acoustic_state)
            return (tf.add(step, 1), mel_ta, align_ta, style_align_ta,
                    gate_ta, next_states)

        loop_result = tf.while_loop(keep_going, decode_step, [
            step0, mel_ta0, align_ta0, style_align_ta0, gate_ta0, states0
        ])
        _, mel_ta, align_ta, style_align_ta, gate_ta, *_ = loop_result

    # ---- outputs: stacked TensorArrays are time major -------------------
    with tf.variable_scope("output"):
        stacked_mel = tf.reshape(
            mel_ta.stack(),
            shape=(decoder_steps, n_batch, OUTPUT_MEL_DIM * self.r))
        output_mel = tf.reshape(
            tf.transpose(stacked_mel, perm=(1, 0, 2)),
            shape=(n_batch, n_out_steps, OUTPUT_MEL_DIM))
        self.out_mel = output_mel

        with tf.variable_scope("post-net"):
            post_cbhg = CBHG(8, (256, OUTPUT_MEL_DIM))
            post_out = post_cbhg(output_mel,
                                 sequence_length=None,
                                 is_training=self.training,
                                 time_major=False)
            output_spec = tf.layers.dense(post_out, OUTPUT_SPEC_DIM)
            self.out_stftm = output_spec

        text_align = tf.reshape(
            align_ta.stack(),
            shape=(decoder_steps, n_batch, n_in_steps))
        text_align = tf.transpose(text_align, perm=(1, 0, 2))  # batch major

        style_align = tf.reshape(
            style_align_ta.stack(),
            shape=(decoder_steps, n_batch, styles_kind))
        style_align = tf.transpose(style_align, perm=(1, 0, 2))  # batch major

        gates = tf.reshape(gate_ta.stack(),
                           shape=(decoder_steps, n_batch, 1))
        gates = tf.transpose(gates, perm=(1, 0, 2))  # batch major
        self.weighting = gates

    # ---- losses and summaries --------------------------------------------
    with tf.variable_scope("loss_and_metric"):
        self.loss_mel = tf.reduce_mean(tf.abs(mel_gtruth - output_mel))
        self.loss_spec = tf.reduce_mean(tf.abs(spec_gtruth - output_spec))
        self.loss = self.loss_mel + self.loss_spec

        # Trailing channel axis so the tensors can be logged as images.
        self.alpha_img = tf.expand_dims(text_align, -1)
        self.alpha_style_img = tf.expand_dims(style_align, -1)
        self.weight_img = tf.expand_dims(gates, -1)

        # NOTE(review): the scalar summaries index token 5 and element 100
        # directly; this presumably needs styles_kind > 5 and
        # style_dim > 100 -- confirm against the constants.
        self.sums = [
            tf.summary.image("train/alpha", self.alpha_img[:2]),
            tf.summary.image("train/alpha_style", self.alpha_style_img[:2]),
            tf.summary.image("train/weight", self.weight_img[:2]),
            tf.summary.scalar("train/loss", self.loss),
            tf.summary.scalar("train/style_0_0",
                              self.single_style_token[0][0][0]),
            tf.summary.scalar("train/style_0_100",
                              self.single_style_token[0][0][100]),
            tf.summary.scalar("train/style_5_100",
                              self.single_style_token[0][5][100]),
            tf.summary.histogram("train/style_vec", self.single_style_token),
        ]

        self.pred_audio_holder = tf.placeholder(shape=(None, None),
                                                dtype=tf.float32,
                                                name='pred_audio')
        self.pred_audio_summary = tf.summary.audio('pred_audio_summary',
                                                   self.pred_audio_holder,
                                                   sample_rate=sr,
                                                   max_outputs=12)