def compute_per_example_generator_losses(features):
    # Generate multi-band waveform from mel, then combine sub-bands with PQMF.
    y_mb_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat = pqmf.synthesis(y_mb_hat)

    # Decompose the ground-truth audio into the same sub-bands and flatten
    # both to (batch * subbands, time) so the STFT loss sees each band.
    y_mb = pqmf.analysis(tf.expand_dims(audios, -1))
    y_mb = tf.transpose(y_mb, (0, 2, 1))
    y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))
    y_mb_hat = tf.transpose(y_mb_hat, (0, 2, 1))
    y_mb_hat = tf.reshape(y_mb_hat, (-1, tf.shape(y_mb_hat)[-1]))

    # Sub-band multi-resolution STFT loss, averaged over sub-bands.
    sub_sc_loss, sub_mag_loss = calculate_2d_loss(y_mb, y_mb_hat, sub_band_stft_loss)
    sub_sc_loss = tf.reduce_mean(tf.reshape(sub_sc_loss, [-1, pqmf.subbands]), -1)
    sub_mag_loss = tf.reduce_mean(tf.reshape(sub_mag_loss, [-1, pqmf.subbands]), -1)

    # Full-band multi-resolution STFT loss on the synthesized waveform.
    full_sc_loss, full_mag_loss = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), full_band_stft_loss
    )
    generator_loss = 0.5 * (sub_sc_loss + sub_mag_loss) + 0.5 * (
        full_sc_loss + full_mag_loss
    )

    # Adversarial loss: push discriminator scores on fakes towards 1,
    # averaged over discriminator scales.
    p_hat = discriminator(y_hat)
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(
            tf.ones_like(p_hat[i][-1]), p_hat[i][-1], loss_fn=mse_loss
        )
    adv_loss /= i + 1
    generator_loss += 2.5 * adv_loss
    per_example_losses = generator_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_loss_per_example = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': tf.reduce_mean(adv_loss),
        'gen_loss': tf.reduce_mean(generator_loss),
        'subband_spectral_convergence_loss': tf.reduce_mean(sub_sc_loss),
        'subband_log_magnitude_loss': tf.reduce_mean(sub_mag_loss),
        'fullband_spectral_convergence_loss': tf.reduce_mean(full_sc_loss),
        'fullband_log_magnitude_loss': tf.reduce_mean(full_mag_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_loss_per_example),
    }
    return per_example_losses, dict_metrics_losses
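# A minimal sketch of reducing the per-example losses above to a scalar and
# minimizing it, assuming a TF1 graph context; the `g_optimizer` name and the
# Adam hyperparameters are illustrative (they match the settings used further
# down in this document, not necessarily the original script). `var_list`
# restricts the update to generator weights, since a discriminator lives in
# the same graph here.
per_example_losses, dict_metrics_losses = compute_per_example_generator_losses(features)
scalar_gen_loss = tf.reduce_mean(per_example_losses)
g_optimizer = tf.train.AdamOptimizer(0.0001, beta1=0.5, beta2=0.9).minimize(
    scalar_gen_loss, var_list=generator.trainable_variables
)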
def compute_per_example_generator_losses(audios, outputs):
    y_hat = outputs

    # Discriminator outputs on generated and real audio; each element is a
    # list of intermediate feature maps with the final score last.
    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    # Adversarial loss: push discriminator scores on fakes towards 1,
    # averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += mse_loss(tf.ones_like(p_hat[i][-1]), p_hat[i][-1])
    adv_loss /= i + 1

    # Feature-matching loss over intermediate discriminator layers.
    fm_loss = 0.0
    for i in range(len(p_hat)):
        for j in range(len(p_hat[i]) - 1):
            fm_loss += mae_loss(p[i][j], p_hat[i][j])
    fm_loss /= (i + 1) * (j + 1)
    adv_loss += 10 * fm_loss
    per_example_losses = adv_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_loss_per_example = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': tf.reduce_mean(adv_loss),
        'fm_loss': tf.reduce_mean(fm_loss),
        'gen_loss': tf.reduce_mean(adv_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_loss_per_example),
    }
    return per_example_losses, dict_metrics_losses
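# This variant calls `mse_loss` / `mae_loss` directly, so both are assumed to
# be Keras losses built with Reduction.NONE, returning per-example values
# rather than a pre-reduced scalar. A sketch of definitions assumed earlier
# in the script:
mse_loss = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.NONE
)
mae_loss = tf.keras.losses.MeanAbsoluteError(
    reduction=tf.keras.losses.Reduction.NONE
)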
def compute_per_example_generator_losses(features):
    y_hat = generator(features['mel'], training=True)
    audios = features['audio']

    # Multi-resolution STFT loss; zero out exploding terms (>= 15.0) so a
    # single bad example cannot destabilize training.
    sc_loss, mag_loss = calculate_2d_loss(audios, tf.squeeze(y_hat, -1), stft_loss)
    sc_loss = tf.where(sc_loss >= 15.0, tf.zeros_like(sc_loss), sc_loss)
    mag_loss = tf.where(mag_loss >= 15.0, tf.zeros_like(mag_loss), mag_loss)
    generator_loss = 0.5 * (sc_loss + mag_loss)

    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    # Adversarial loss: push discriminator scores on fakes towards 1,
    # averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(
            tf.ones_like(p_hat[i][-1]), p_hat[i][-1], loss_fn=mse_loss
        )
    adv_loss /= i + 1

    # Feature-matching loss over intermediate discriminator layers.
    fm_loss = 0.0
    for i in range(len(p_hat)):
        for j in range(len(p_hat[i]) - 1):
            fm_loss += calculate_3d_loss(p[i][j], p_hat[i][j], loss_fn=mae_loss)
    fm_loss /= (i + 1) * (j + 1)
    adv_loss += 10.0 * fm_loss
    generator_loss += 4.0 * adv_loss
    per_example_losses = generator_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_loss_per_example = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), loss_fn=mels_loss
    )
    dict_metrics_losses = {
        'adversarial_loss': tf.reduce_mean(adv_loss),
        'fm_loss': tf.reduce_mean(fm_loss),
        'gen_loss': tf.reduce_mean(generator_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_loss_per_example),
    }
    return per_example_losses, dict_metrics_losses
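# The generator losses above assume a discriminator trained in tandem. A
# counterpart sketch for the discriminator side, not part of the original
# excerpt: real scores are pushed towards 1 and fake scores towards 0 with
# the same mse_loss helper and averaging convention.
def compute_per_example_discriminator_losses(features, gen_outputs):
    audios = features['audio']
    p = discriminator(tf.expand_dims(audios, 2))
    p_hat = discriminator(gen_outputs)
    real_loss = 0.0
    fake_loss = 0.0
    for i in range(len(p)):
        real_loss += calculate_3d_loss(
            tf.ones_like(p[i][-1]), p[i][-1], loss_fn=mse_loss
        )
        fake_loss += calculate_3d_loss(
            tf.zeros_like(p_hat[i][-1]), p_hat[i][-1], loss_fn=mse_loss
        )
    real_loss /= i + 1
    fake_loss /= i + 1
    per_example_losses = real_loss + fake_loss
    dict_metrics_losses = {
        'real_loss': tf.reduce_mean(real_loss),
        'fake_loss': tf.reduce_mean(fake_loss),
        'dis_loss': tf.reduce_mean(per_example_losses),
    }
    return per_example_losses, dict_metrics_losses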
def model_fn(features, labels, mode, params):
    # Build Tacotron2 with the malaya-speech base config.
    tacotron2_config = malaya_speech.config.tacotron2_config
    tacotron2_config['reduction_factor'] = reduction_factor
    c = tacotron2.Config(
        vocab_size=len(MALAYA_SPEECH_SYMBOLS) + 1, **tacotron2_config
    )
    model = tacotron2.Model(c)

    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype=tf.int32)
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    mel_actuals = features['mel']
    guided = features['g']

    r = model(
        input_ids,
        input_lengths,
        speaker_ids,
        mel_outputs,
        mel_lengths,
        training=True,
    )
    decoder_output, post_mel_outputs, stop_token_predictions, alignment_histories = r

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    # L1 reconstruction losses before and after the postnet.
    mel_loss_before = calculate_3d_loss(mel_actuals, decoder_output, loss_fn=mae)
    mel_loss_after = calculate_3d_loss(mel_actuals, post_mel_outputs, loss_fn=mae)

    # Stop-token targets: 1 for every padded frame at or past each mel length.
    max_mel_length = tf.reduce_max(mel_lengths)
    stop_gts = tf.expand_dims(tf.range(max_mel_length, dtype=tf.int32), 0)
    stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(stop_gts, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn=binary_crossentropy
    )

    # Guided-attention loss; entries equal to -1.0 in `g` are masked out.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis=[1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    tf.identity(loss, 'loss')
    tf.identity(stop_token_loss, name='stop_token_loss')
    tf.identity(mel_loss_before, name='mel_loss_before')
    tf.identity(mel_loss_after, name='mel_loss_after')
    tf.identity(loss_att, name='loss_att')

    tf.summary.scalar('stop_token_loss', stop_token_loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('loss_att', loss_att)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.adamw.create_optimizer(
            loss=loss,
            init_lr=learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            end_learning_rate=end_learning_rate,
            weight_decay_rate=weight_decay_rate,
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    elif mode == tf.estimator.ModeKeys.EVAL:
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    return estimator_spec
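# A minimal sketch of wiring the model_fn above into an Estimator run,
# assuming input_fn builders that yield the feature dicts used above; the
# names `train_input_fn`, `dev_input_fn`, and the model directory are
# illustrative, not from the original script.
estimator = tf.estimator.Estimator(model_fn=model_fn, model_dir='tacotron2')
train_spec = tf.estimator.TrainSpec(input_fn=train_input_fn, max_steps=num_train_steps)
eval_spec = tf.estimator.EvalSpec(input_fn=dev_input_fn, throttle_secs=600)
tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)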
sub_band_stft_loss = stft.loss.MultiResolutionSTFT(
    **mb_melgan_config['subband_stft_loss_params']
)
full_band_stft_loss = stft.loss.MultiResolutionSTFT(
    **mb_melgan_config['stft_loss_params']
)

# Generator forward pass and PQMF sub-band decomposition of the ground truth.
y_mb_hat = generator(features['mel'], training=True)
audios = features['audio']
y_hat = pqmf.synthesis(y_mb_hat)
y_mb = pqmf.analysis(tf.expand_dims(audios, -1))
y_mb = tf.transpose(y_mb, (0, 2, 1))
y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))
y_mb_hat = tf.transpose(y_mb_hat, (0, 2, 1))
y_mb_hat = tf.reshape(y_mb_hat, (-1, tf.shape(y_mb_hat)[-1]))

# Sub-band and full-band multi-resolution STFT losses.
sub_sc_loss, sub_mag_loss = calculate_2d_loss(y_mb, y_mb_hat, sub_band_stft_loss)
sub_sc_loss = tf.reduce_mean(tf.reshape(sub_sc_loss, [-1, pqmf.subbands]), -1)
sub_mag_loss = tf.reduce_mean(tf.reshape(sub_mag_loss, [-1, pqmf.subbands]), -1)
full_sc_loss, full_mag_loss = calculate_2d_loss(
    audios, tf.squeeze(y_hat, -1), full_band_stft_loss
)
generator_loss = 0.5 * (sub_sc_loss + sub_mag_loss) + 0.5 * (
    full_sc_loss + full_mag_loss
)
generator_loss = tf.reduce_mean(generator_loss)

g_optimizer = tf.train.AdamOptimizer(0.0001, beta1=0.5, beta2=0.9).minimize(
    generator_loss
)

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
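# With the graph and session above, a bare training loop just runs the
# optimizer repeatedly; a minimal sketch, assuming `features` comes from a
# tf.data iterator already wired into the graph so each sess.run consumes a
# fresh batch.
for step in range(num_train_steps):
    _, loss_value = sess.run([g_optimizer, generator_loss])
    if step % 100 == 0:
        print(step, loss_value)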
def model_fn(features, labels, mode, params):
    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype=tf.int32)
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    guided = features['g']

    model = tacotron2.Model(
        [input_ids, input_lengths],
        [mel_outputs, mel_lengths],
        len(MALAYA_SPEECH_SYMBOLS),
    )
    r = model.decoder_logits['outputs']
    decoder_output, post_mel_outputs, alignment_histories, _, _, _ = r
    stop_token_predictions = model.decoder_logits['stop_token_prediction']
    stop_token_predictions = stop_token_predictions[:, :, 0]

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    # L1 reconstruction losses before and after the postnet.
    mel_loss_before = calculate_3d_loss(mel_outputs, decoder_output, loss_fn=mae)
    mel_loss_after = calculate_3d_loss(mel_outputs, post_mel_outputs, loss_fn=mae)

    # Stop-token targets: 1 for every padded frame at or past each mel length.
    max_mel_length = tf.reduce_max(mel_lengths)
    stop_gts = tf.expand_dims(tf.range(max_mel_length, dtype=tf.int32), 0)
    stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(stop_gts, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn=binary_crossentropy
    )

    # Guided-attention loss; entries equal to -1.0 in `g` are masked out.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis=[1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    tf.identity(loss, 'loss')
    tf.identity(stop_token_loss, name='stop_token_loss')
    tf.identity(mel_loss_before, name='mel_loss_before')
    tf.identity(mel_loss_after, name='mel_loss_after')
    tf.identity(loss_att, name='loss_att')

    tf.summary.scalar('stop_token_loss', stop_token_loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('loss_att', loss_att)

    if mode == tf.estimator.ModeKeys.TRAIN:
        # `parameters` is a module-level training config dict defined
        # elsewhere in the script.
        train_op = train.optimizer.optimize_loss(
            loss,
            tf.train.AdamOptimizer,
            parameters['optimizer_params'],
            learning_rate_scheduler,
            summaries=[
                'learning_rate',
                'variables',
                'gradients',
                'larc_summaries',
                'variable_norm',
                'gradient_norm',
                'global_gradient_norm',
            ],
            larc_params=parameters.get('larc_params', None),
            loss_scaling=parameters.get('loss_scaling', 1.0),
            loss_scaling_params=parameters.get('loss_scaling_params', None),
            clip_gradients=parameters.get('max_grad_norm', None),
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=mode, loss=loss, train_op=train_op
        )
    elif mode == tf.estimator.ModeKeys.EVAL:
        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss
        )
    return estimator_spec
import tensorflow as tf
import malaya_speech.config
from malaya_speech.train.model import hifigan, stft
from malaya_speech.train.loss import calculate_2d_loss, calculate_3d_loss

# Build the HiFi-GAN generator and the multi-resolution STFT loss from the
# malaya-speech base config.
hifigan_config = malaya_speech.config.hifigan_config
generator = hifigan.Generator(
    hifigan.GeneratorConfig(**hifigan_config['hifigan_generator_params']),
    name='hifigan_generator',
)
stft_loss = stft.loss.MultiResolutionSTFT(**hifigan_config['stft_loss_params'])

y_hat = generator(features['mel'], training=True)
audios = features['audio']

# Multi-resolution STFT loss; zero out exploding terms (>= 15.0) so a single
# bad example cannot destabilize training.
sc_loss, mag_loss = calculate_2d_loss(audios, tf.squeeze(y_hat, -1), stft_loss)
sc_loss = tf.where(sc_loss >= 15.0, tf.zeros_like(sc_loss), sc_loss)
mag_loss = tf.where(mag_loss >= 15.0, tf.zeros_like(mag_loss), mag_loss)
generator_loss = 0.5 * (sc_loss + mag_loss)
generator_loss = tf.reduce_mean(generator_loss)

# Piecewise-constant learning-rate schedule for the generator.
global_step = tf.train.get_or_create_global_step()
g_boundaries = [100_000, 200_000, 300_000, 400_000, 500_000, 600_000, 700_000]
g_values = [
    0.0005,
    0.0005,