def compute_per_example_generator_losses(features):
    # Generate multi-band waveform from mel, then combine sub-bands with PQMF.
    y_mb_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat = pqmf.synthesis(y_mb_hat)

    # Decompose the ground-truth audio into the same sub-bands and flatten
    # both to (batch * subbands, time) so the STFT loss sees each band.
    y_mb = pqmf.analysis(tf.expand_dims(audios, -1))
    y_mb = tf.transpose(y_mb, (0, 2, 1))
    y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))
    y_mb_hat = tf.transpose(y_mb_hat, (0, 2, 1))
    y_mb_hat = tf.reshape(y_mb_hat, (-1, tf.shape(y_mb_hat)[-1]))

    # Sub-band multi-resolution STFT loss, averaged over sub-bands.
    sub_sc_loss, sub_mag_loss = calculate_2d_loss(y_mb, y_mb_hat, sub_band_stft_loss)
    sub_sc_loss = tf.reduce_mean(tf.reshape(sub_sc_loss, [-1, pqmf.subbands]), -1)
    sub_mag_loss = tf.reduce_mean(tf.reshape(sub_mag_loss, [-1, pqmf.subbands]), -1)

    # Full-band multi-resolution STFT loss on the synthesized waveform.
    full_sc_loss, full_mag_loss = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), full_band_stft_loss
    )
    generator_loss = 0.5 * (sub_sc_loss + sub_mag_loss) + 0.5 * (
        full_sc_loss + full_mag_loss
    )

    # Adversarial loss: push discriminator scores on fakes towards 1,
    # averaged over discriminator scales.
    p_hat = discriminator(y_hat)
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(
            tf.ones_like(p_hat[i][-1]), p_hat[i][-1], loss_fn=mse_loss
        )
    adv_loss /= i + 1
    generator_loss += 2.5 * adv_loss
    per_example_losses = generator_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_loss_per_example = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': tf.reduce_mean(adv_loss),
        'gen_loss': tf.reduce_mean(generator_loss),
        'subband_spectral_convergence_loss': tf.reduce_mean(sub_sc_loss),
        'subband_log_magnitude_loss': tf.reduce_mean(sub_mag_loss),
        'fullband_spectral_convergence_loss': tf.reduce_mean(full_sc_loss),
        'fullband_log_magnitude_loss': tf.reduce_mean(full_mag_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_loss_per_example),
    }
    return per_example_losses, dict_metrics_losses
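# A minimal sketch of reducing the per-example losses above to a scalar and
# minimizing it, assuming a TF1 graph context; the `g_optimizer` name and the
# Adam hyperparameters are illustrative (they match the settings used further
# down in this document, not necessarily the original script). `var_list`
# restricts the update to generator weights, since a discriminator lives in
# the same graph here.
per_example_losses, dict_metrics_losses = compute_per_example_generator_losses(features)
scalar_gen_loss = tf.reduce_mean(per_example_losses)
g_optimizer = tf.train.AdamOptimizer(0.0001, beta1=0.5, beta2=0.9).minimize(
    scalar_gen_loss, var_list=generator.trainable_variables
)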
def compute_per_example_generator_losses(audios, outputs):
    y_hat = outputs

    # Discriminator outputs on generated and real audio; each element is a
    # list of intermediate feature maps with the final score last.
    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    # Adversarial loss: push discriminator scores on fakes towards 1,
    # averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += mse_loss(tf.ones_like(p_hat[i][-1]), p_hat[i][-1])
    adv_loss /= i + 1

    # Feature-matching loss over intermediate discriminator layers.
    fm_loss = 0.0
    for i in range(len(p_hat)):
        for j in range(len(p_hat[i]) - 1):
            fm_loss += mae_loss(p[i][j], p_hat[i][j])
    fm_loss /= (i + 1) * (j + 1)
    adv_loss += 10 * fm_loss
    per_example_losses = adv_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_loss_per_example = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': tf.reduce_mean(adv_loss),
        'fm_loss': tf.reduce_mean(fm_loss),
        'gen_loss': tf.reduce_mean(adv_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_loss_per_example),
    }
    return per_example_losses, dict_metrics_losses
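# This variant calls `mse_loss` / `mae_loss` directly, so both are assumed to
# be Keras losses built with Reduction.NONE, returning per-example values
# rather than a pre-reduced scalar. A sketch of definitions assumed earlier
# in the script:
mse_loss = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.NONE
)
mae_loss = tf.keras.losses.MeanAbsoluteError(
    reduction=tf.keras.losses.Reduction.NONE
)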
def compute_per_example_generator_losses(features):
    y_hat = generator(features['mel'], training=True)
    audios = features['audio']

    # Multi-resolution STFT loss; zero out exploding terms (>= 15.0) so a
    # single bad example cannot destabilize training.
    sc_loss, mag_loss = calculate_2d_loss(audios, tf.squeeze(y_hat, -1), stft_loss)
    sc_loss = tf.where(sc_loss >= 15.0, tf.zeros_like(sc_loss), sc_loss)
    mag_loss = tf.where(mag_loss >= 15.0, tf.zeros_like(mag_loss), mag_loss)
    generator_loss = 0.5 * (sc_loss + mag_loss)

    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    # Adversarial loss: push discriminator scores on fakes towards 1,
    # averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(
            tf.ones_like(p_hat[i][-1]), p_hat[i][-1], loss_fn=mse_loss
        )
    adv_loss /= i + 1

    # Feature-matching loss over intermediate discriminator layers.
    fm_loss = 0.0
    for i in range(len(p_hat)):
        for j in range(len(p_hat[i]) - 1):
            fm_loss += calculate_3d_loss(p[i][j], p_hat[i][j], loss_fn=mae_loss)
    fm_loss /= (i + 1) * (j + 1)
    adv_loss += 10.0 * fm_loss
    generator_loss += 4.0 * adv_loss
    per_example_losses = generator_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_loss_per_example = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': tf.reduce_mean(adv_loss),
        'fm_loss': tf.reduce_mean(fm_loss),
        'gen_loss': tf.reduce_mean(generator_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_loss_per_example),
    }
    return per_example_losses, dict_metrics_losses
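# The generator losses above assume a discriminator trained in tandem. A
# counterpart sketch for the discriminator side, not part of the original
# excerpt: real scores are pushed towards 1 and fake scores towards 0 with
# the same mse_loss helper and averaging convention.
def compute_per_example_discriminator_losses(features, gen_outputs):
    audios = features['audio']
    p = discriminator(tf.expand_dims(audios, 2))
    p_hat = discriminator(gen_outputs)
    real_loss = 0.0
    fake_loss = 0.0
    for i in range(len(p)):
        real_loss += calculate_3d_loss(
            tf.ones_like(p[i][-1]), p[i][-1], loss_fn=mse_loss
        )
        fake_loss += calculate_3d_loss(
            tf.zeros_like(p_hat[i][-1]), p_hat[i][-1], loss_fn=mse_loss
        )
    real_loss /= i + 1
    fake_loss /= i + 1
    per_example_losses = real_loss + fake_loss
    dict_metrics_losses = {
        'real_loss': tf.reduce_mean(real_loss),
        'fake_loss': tf.reduce_mean(fake_loss),
        'dis_loss': tf.reduce_mean(per_example_losses),
    }
    return per_example_losses, dict_metrics_losses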
def model_fn(features, labels, mode, params):
    # Build Tacotron2 with the malaya-speech base config.
    tacotron2_config = malaya_speech.config.tacotron2_config
    tacotron2_config['reduction_factor'] = reduction_factor
    c = tacotron2.Config(
        vocab_size=len(MALAYA_SPEECH_SYMBOLS) + 1, **tacotron2_config
    )
    model = tacotron2.Model(c)

    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype=tf.int32)
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    mel_actuals = features['mel']
    guided = features['g']

    r = model(
        input_ids,
        input_lengths,
        speaker_ids,
        mel_outputs,
        mel_lengths,
        training=True,
    )
    decoder_output, post_mel_outputs, stop_token_predictions, alignment_histories = r

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    # L1 reconstruction losses before and after the postnet.
    mel_loss_before = calculate_3d_loss(mel_actuals, decoder_output, loss_fn=mae)
    mel_loss_after = calculate_3d_loss(mel_actuals, post_mel_outputs, loss_fn=mae)

    # Stop-token targets: 1 for every padded frame at or past each mel length.
    max_mel_length = tf.reduce_max(mel_lengths)
    stop_gts = tf.expand_dims(tf.range(max_mel_length, dtype=tf.int32), 0)
    stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(stop_gts, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn=binary_crossentropy
    )

    # Guided-attention loss; entries equal to -1.0 in `g` are masked out.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis=[1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    tf.identity(loss, 'loss')
    tf.identity(stop_token_loss, name='stop_token_loss')
    tf.identity(mel_loss_before, name='mel_loss_before')
    tf.identity(mel_loss_after, name='mel_loss_after')
    tf.identity(loss_att, name='loss_att')

    tf.summary.scalar('stop_token_loss', stop_token_loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('loss_att', loss_att)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.adamw.create_optimizer(
            loss=loss,
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            end_learning_rate=end_learning_rate,
            weight_decay_rate=weight_decay_rate,
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    elif mode == tf.estimator.ModeKeys.EVAL:
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    return estimator_spec
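# A minimal sketch of wiring the model_fn above into an Estimator run,
# assuming input_fn builders that yield the feature dicts used above; the
# names `train_input_fn`, `dev_input_fn`, and the model directory are
# illustrative, not from the original script.
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='tacotron2')
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps)
eval_spec = tf.estimator.EvalSpec(input_fn=dev_input_fn, throttle_secs=600)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)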
sub_band_stft_loss = stft.loss.MultiResolutionSTFT(
    **mb_melgan_config['subband_stft_loss_params']
)
full_band_stft_loss = stft.loss.MultiResolutionSTFT(
    **mb_melgan_config['stft_loss_params']
)

# Generator forward pass and PQMF sub-band decomposition of the ground truth.
y_mb_hat = generator(features['mel'], training=True)
audios = features['audio']
y_hat = pqmf.synthesis(y_mb_hat)
y_mb = pqmf.analysis(tf.expand_dims(audios, -1))
y_mb = tf.transpose(y_mb, (0, 2, 1))
y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))
y_mb_hat = tf.transpose(y_mb_hat, (0, 2, 1))
y_mb_hat = tf.reshape(y_mb_hat, (-1, tf.shape(y_mb_hat)[-1]))

# Sub-band and full-band multi-resolution STFT losses.
sub_sc_loss, sub_mag_loss = calculate_2d_loss(y_mb, y_mb_hat, sub_band_stft_loss)
sub_sc_loss = tf.reduce_mean(tf.reshape(sub_sc_loss, [-1, pqmf.subbands]), -1)
sub_mag_loss = tf.reduce_mean(tf.reshape(sub_mag_loss, [-1, pqmf.subbands]), -1)
full_sc_loss, full_mag_loss = calculate_2d_loss(
    audios, tf.squeeze(y_hat, -1), full_band_stft_loss
)
generator_loss = 0.5 * (sub_sc_loss + sub_mag_loss) + 0.5 * (
    full_sc_loss + full_mag_loss
)
generator_loss = tf.reduce_mean(generator_loss)

g_optimizer = tf.train.AdamOptimizer(0.0001, beta1=0.5, beta2=0.9).minimize(
    generator_loss
)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
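# With the graph and session above, a bare training loop just runs the
# optimizer repeatedly; a minimal sketch, assuming `features` comes from a
# tf.data iterator already wired into the graph so each sess.run consumes a
# fresh batch.
for step in range(num_train_steps):
    _, loss_value = sess.run([g_optimizer, generator_loss])
    if step % 100 == 0:
        print(step, loss_value)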
def model_fn(features, labels, mode, params):
    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype=tf.int32)
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    guided = features['g']

    model = tacotron2.Model(
        [input_ids, input_lengths],
        [mel_outputs, mel_lengths],
        len(MALAYA_SPEECH_SYMBOLS),
    )
    r = model.decoder_logits['outputs']
    decoder_output, post_mel_outputs, alignment_histories, _, _, _ = r
    stop_token_predictions = model.decoder_logits['stop_token_prediction']
    stop_token_predictions = stop_token_predictions[:, :, 0]

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    # L1 reconstruction losses before and after the postnet.
    mel_loss_before = calculate_3d_loss(mel_outputs, decoder_output, loss_fn=mae)
    mel_loss_after = calculate_3d_loss(mel_outputs, post_mel_outputs, loss_fn=mae)

    # Stop-token targets: 1 for every padded frame at or past each mel length.
    max_mel_length = tf.reduce_max(mel_lengths)
    stop_gts = tf.expand_dims(tf.range(max_mel_length, dtype=tf.int32), 0)
    stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(stop_gts, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn=binary_crossentropy
    )

    # Guided-attention loss; entries equal to -1.0 in `g` are masked out.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis=[1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    tf.identity(loss, 'loss')
    tf.identity(stop_token_loss, name='stop_token_loss')
    tf.identity(mel_loss_before, name='mel_loss_before')
    tf.identity(mel_loss_after, name='mel_loss_after')
    tf.identity(loss_att, name='loss_att')

    tf.summary.scalar('stop_token_loss', stop_token_loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('loss_att', loss_att)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # `parameters` is a module-level training config dict defined
        # elsewhere in the script.
        train_op = train.optimizer.optimize_loss(
            loss,
            tf.train.AdamOptimizer,
            parameters['optimizer_params'],
            learning_rate_scheduler,
            summaries=[
                'learning_rate',
                'variables',
                'gradients',
                'larc_summaries',
                'variable_norm',
                'gradient_norm',
                'global_gradient_norm',
            ],
            larc_params=parameters.get('larc_params', None),
            loss_scaling=parameters.get('loss_scaling', 1.0),
            loss_scaling_params=parameters.get('loss_scaling_params', None),
            clip_gradients=parameters.get('max_grad_norm', None),
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    elif mode == tf.estimator.ModeKeys.EVAL:
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    return estimator_spec
import tensorflow as tf
import malaya_speech.config
from malaya_speech.train.model import hifigan, stft
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

# Build the HiFi-GAN generator and the multi-resolution STFT loss from the
# malaya-speech base config.
hifigan_config = malaya_speech.config.hifigan_config
generator = hifigan.Generator(
    hifigan.GeneratorConfig(**hifigan_config['hifigan_generator_params']),
    name='hifigan_generator',
)
stft_loss = stft.loss.MultiResolutionSTFT(**hifigan_config['stft_loss_params'])

y_hat = generator(features['mel'], training=True)
audios = features['audio']

# Multi-resolution STFT loss; zero out exploding terms (>= 15.0) so a single
# bad example cannot destabilize training.
sc_loss, mag_loss = calculate_2d_loss(audios, tf.squeeze(y_hat, -1), stft_loss)
sc_loss = tf.where(sc_loss >= 15.0, tf.zeros_like(sc_loss), sc_loss)
mag_loss = tf.where(mag_loss >= 15.0, tf.zeros_like(mag_loss), mag_loss)
generator_loss = 0.5 * (sc_loss + mag_loss)
generator_loss = tf.reduce_mean(generator_loss)

# Piecewise-constant learning-rate schedule for the generator.
global_step = tf.train.get_or_create_global_step()
g_boundaries = [100_000, 200_000, 300_000, 400_000, 500_000, 600_000, 700_000]
g_values = [
    0.0005,
    0.0005,