Code Example #1
def compute_per_example_discriminator_losses(features):
    # Multi-band generator output is collapsed back to a full-band
    # waveform via PQMF synthesis before being scored.
    y_mb_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat = pqmf.synthesis(y_mb_hat)
    y = tf.expand_dims(audios, 2)  # [B, T] -> [B, T, 1]
    p = discriminator(y)           # per-scale lists of feature maps; [-1] is the score
    p_hat = discriminator(y_hat)

    # Least-squares GAN objective: real waveforms pushed toward 1,
    # generated toward 0, averaged over discriminator scales.
    real_loss = 0.0
    fake_loss = 0.0
    for i in range(len(p)):
        real_loss += calculate_3d_loss(tf.ones_like(p[i][-1]),
                                       p[i][-1],
                                       loss_fn=mse_loss)
        fake_loss += calculate_3d_loss(tf.zeros_like(p_hat[i][-1]),
                                       p_hat[i][-1],
                                       loss_fn=mse_loss)
    real_loss /= len(p)
    fake_loss /= len(p)
    dis_loss = real_loss + fake_loss

    per_example_losses = dis_loss

    dict_metrics_losses = {
        'real_loss': real_loss,
        'fake_loss': fake_loss,
        'dis_loss': dis_loss,
    }

    return per_example_losses, dict_metrics_losses
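
None of these snippets define calculate_3d_loss or calculate_2d_loss. A minimal sketch of what such helpers presumably do, assuming Keras-style loss callables that keep the batch axis; the trimming logic and exact shapes here are assumptions, not the project's implementation:

import tensorflow as tf

def calculate_3d_loss(y_gt, y_pred, loss_fn):
    # Trim both tensors to the shorter time axis so shapes agree, then
    # return one loss value per batch example: [B, T, D] -> [B].
    min_t = tf.minimum(tf.shape(y_gt)[1], tf.shape(y_pred)[1])
    loss = loss_fn(y_gt[:, :min_t], y_pred[:, :min_t])
    if len(loss.shape) > 1:
        loss = tf.reduce_mean(loss, axis=list(range(1, len(loss.shape))))
    return loss

def calculate_2d_loss(y_gt, y_pred, loss_fn):
    # Same idea for [B, T] inputs.
    min_t = tf.minimum(tf.shape(y_gt)[1], tf.shape(y_pred)[1])
    return loss_fn(y_gt[:, :min_t], y_pred[:, :min_t])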
Code Example #2
def compute_per_example_generator_losses(audios, outputs):
    y_hat = outputs
    p_hat = discriminator(y_hat)                  # scores for generated audio
    p = discriminator(tf.expand_dims(audios, 2))  # scores for real audio

    # Adversarial objective: push discriminator outputs on generated
    # audio toward 1, averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(tf.ones_like(p_hat[i][-1]),
                                      p_hat[i][-1],
                                      loss_fn=mse_loss)
    adv_loss /= len(p_hat)

    # Feature-matching loss over every intermediate discriminator layer.
    fm_loss = 0.0
    num_layers = 0
    for i in range(len(p_hat)):
        for j in range(len(p_hat[i]) - 1):
            fm_loss += calculate_3d_loss(p[i][j],
                                         p_hat[i][j],
                                         loss_fn=mae_loss)
            num_layers += 1
    fm_loss /= num_layers
    adv_loss += 10 * fm_loss

    per_example_losses = adv_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_spectrogram = calculate_2d_loss(audios, tf.squeeze(y_hat, -1),
                                         loss_fn=mels_loss)

    dict_metrics_losses = {
        'adversarial_loss': adv_loss,
        'fm_loss': fm_loss,
        'gen_loss': adv_loss,  # fm term is already folded into adv_loss
        'mels_spectrogram_loss': tf.reduce_mean(mels_spectrogram),
    }

    return per_example_losses, dict_metrics_losses
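
mse_loss and mae_loss are likewise defined outside these snippets. A plausible sketch, assuming Keras losses with batch reduction disabled so a per-example loss reaches calculate_3d_loss:

import tensorflow as tf

# Assumed definitions: reduction is disabled so the batch axis survives.
mse_loss = tf.keras.losses.MeanSquaredError(
    reduction=tf.keras.losses.Reduction.NONE)
mae_loss = tf.keras.losses.MeanAbsoluteError(
    reduction=tf.keras.losses.Reduction.NONE)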
Code Example #3
def compute_per_example_discriminator_losses(audios, gen_outputs):
    y_hat = gen_outputs
    y = tf.expand_dims(audios, 2)
    p = discriminator(y)
    p_hat = discriminator(y_hat)

    # Least-squares GAN objective, averaged over discriminator scales.
    real_loss = 0.0
    fake_loss = 0.0
    for i in range(len(p)):
        real_loss += calculate_3d_loss(tf.ones_like(p[i][-1]),
                                       p[i][-1],
                                       loss_fn=mse_loss)
        fake_loss += calculate_3d_loss(tf.zeros_like(p_hat[i][-1]),
                                       p_hat[i][-1],
                                       loss_fn=mse_loss)
    real_loss /= len(p)
    fake_loss /= len(p)
    dis_loss = real_loss + fake_loss

    per_example_losses = dis_loss

    dict_metrics_losses = {
        'real_loss': real_loss,
        'fake_loss': fake_loss,
        'dis_loss': dis_loss,
    }

    return per_example_losses, dict_metrics_losses
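
These per-example loss functions slot into a TF2 training step that reduces the [B]-shaped losses to a scalar before backprop. A hedged sketch; discriminator and d_optimizer stand in for objects the surrounding trainer would own:

import tensorflow as tf

@tf.function
def discriminator_train_step(audios, gen_outputs):
    # Hypothetical wiring around compute_per_example_discriminator_losses.
    with tf.GradientTape() as tape:
        per_example_losses, metrics = compute_per_example_discriminator_losses(
            audios, gen_outputs)
        loss = tf.reduce_mean(per_example_losses)  # [B] -> scalar
    grads = tape.gradient(loss, discriminator.trainable_variables)
    d_optimizer.apply_gradients(zip(grads, discriminator.trainable_variables))
    return metrics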
Code Example #4
File: autovc.py  Project: xiaozhuo12138/malaya-speech
def model_fn(features, labels, mode, params):
    # Assumes `from functools import partial` in the surrounding file.
    vectors = features['v']
    mels = features['mel']
    mels_len = features['mel_length'][:, 0]
    model = autovc.Model(dim_neck=32, dim_pre=512, freq=32)
    encoder_outputs, mel_before, mel_after, codes = model(
        mels, vectors, vectors)
    # Second pass re-encodes the converted mel to recover content codes.
    codes_ = model.call_second(mel_after, vectors)
    loss_f = tf.losses.absolute_difference
    # Mask padded frames so they do not contribute to the mel losses.
    max_length = tf.cast(tf.reduce_max(mels_len), tf.int32)
    mask = tf.sequence_mask(lengths=mels_len,
                            maxlen=max_length,
                            dtype=tf.float32)
    mask = tf.expand_dims(mask, axis=-1)

    mse_mel = partial(loss_f, weights=mask)
    mel_loss_before = calculate_3d_loss(mels, mel_before, mse_mel)
    mel_loss_after = calculate_3d_loss(mels, mel_after, mse_mel)
    # Content-code consistency between the two encoder passes.
    g_loss_cd = tf.losses.absolute_difference(codes, codes_)
    loss = mel_loss_before + mel_loss_after + g_loss_cd

    tf.identity(loss, 'total_loss')
    tf.identity(mel_loss_before, 'mel_loss_before')
    tf.identity(mel_loss_after, 'mel_loss_after')
    tf.identity(g_loss_cd, 'g_loss_cd')

    tf.summary.scalar('total_loss', loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('g_loss_cd', g_loss_cd)

    global_step = tf.train.get_or_create_global_step()

    if mode == tf.estimator.ModeKeys.TRAIN:

        optimizer = tf.train.AdamOptimizer(learning_rate=0.0001)

        train_op = optimizer.minimize(loss, global_step=global_step)
        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    train_op=train_op)

    elif mode == tf.estimator.ModeKeys.EVAL:

        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss)

    return estimator_spec
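
For context, a model_fn like this is driven by the tf.estimator API. A minimal, hypothetical wiring; model_dir and train_input_fn are placeholders, not taken from the project:

import tensorflow as tf

# train_input_fn must yield (features, labels) with the keys the
# model_fn above reads: 'v', 'mel', 'mel_length'.
estimator = tf.estimator.Estimator(model_fn=model_fn,
                                   model_dir='output/autovc')
estimator.train(input_fn=train_input_fn, max_steps=100000)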
Code Example #5
def compute_per_example_generator_losses(features):
    y_hat = generator(features['mel'], training = True)
    audios = features['audio']

    sc_loss, mag_loss = calculate_2d_loss(
        audios, tf.squeeze(y_hat, -1), stft_loss
    )

    # Zero out divergent STFT losses (>= 15.0) so outliers do not
    # destabilise early training.
    sc_loss = tf.where(sc_loss >= 15.0, tf.zeros_like(sc_loss), sc_loss)
    mag_loss = tf.where(mag_loss >= 15.0, tf.zeros_like(mag_loss), mag_loss)

    generator_loss = 0.5 * (sc_loss + mag_loss)

    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    # Adversarial objective, averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(
            tf.ones_like(p_hat[i][-1]), p_hat[i][-1], loss_fn = mse_loss
        )
    adv_loss /= len(p_hat)

    # Feature-matching loss over every intermediate discriminator layer.
    fm_loss = 0.0
    num_layers = 0
    for i in range(len(p_hat)):
        for j in range(len(p_hat[i]) - 1):
            fm_loss += calculate_3d_loss(
                p[i][j], p_hat[i][j], loss_fn = mae_loss
            )
            num_layers += 1

    fm_loss /= num_layers
    adv_loss += 10.0 * fm_loss
    generator_loss += 4.0 * adv_loss

    per_example_losses = generator_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_spectrogram = calculate_2d_loss(audios, tf.squeeze(y_hat, -1),
                                         loss_fn = mels_loss)

    dict_metrics_losses = {
        'adversarial_loss': adv_loss,
        'fm_loss': fm_loss,
        'gen_loss': tf.reduce_mean(generator_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_spectrogram),
    }

    return per_example_losses, dict_metrics_losses
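
stft_loss above returns a (spectral convergence, log-STFT magnitude) pair. A single-resolution sketch of that pair, assuming per-example reduction; the project presumably averages several FFT resolutions:

import tensorflow as tf

def stft_loss(y, y_hat, frame_length=1024, frame_step=256, fft_size=1024):
    # Magnitude spectrograms of real and generated audio: [B, frames, bins].
    s = tf.abs(tf.signal.stft(y, frame_length, frame_step, fft_size))
    s_hat = tf.abs(tf.signal.stft(y_hat, frame_length, frame_step, fft_size))
    # Spectral convergence: relative Frobenius error, one value per example.
    sc = tf.norm(s - s_hat, axis=[1, 2]) / (tf.norm(s, axis=[1, 2]) + 1e-6)
    # L1 distance between log magnitudes, one value per example.
    mag = tf.reduce_mean(
        tf.abs(tf.math.log(s + 1e-6) - tf.math.log(s_hat + 1e-6)),
        axis=[1, 2])
    return sc, mag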
Code Example #6
def compute_per_example_generator_losses(features):
    y_mb_hat = generator(features['mel'], training=True)
    audios = features['audio']
    y_hat = pqmf.synthesis(y_mb_hat)

    # Decompose the real waveform into sub-bands, then flatten so each
    # sub-band becomes its own row: [B, T', subbands] -> [B*subbands, T'].
    y_mb = pqmf.analysis(tf.expand_dims(audios, -1))
    y_mb = tf.transpose(y_mb, (0, 2, 1))
    y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))

    y_mb_hat = tf.transpose(y_mb_hat, (0, 2, 1))
    y_mb_hat = tf.reshape(y_mb_hat, (-1, tf.shape(y_mb_hat)[-1]))
    sub_sc_loss, sub_mag_loss = calculate_2d_loss(y_mb, y_mb_hat,
                                                  sub_band_stft_loss)

    # Fold the per-sub-band losses back to one value per example.
    sub_sc_loss = tf.reduce_mean(tf.reshape(sub_sc_loss, [-1, pqmf.subbands]),
                                 -1)
    sub_mag_loss = tf.reduce_mean(
        tf.reshape(sub_mag_loss, [-1, pqmf.subbands]), -1)
    full_sc_loss, full_mag_loss = calculate_2d_loss(audios,
                                                    tf.squeeze(y_hat, -1),
                                                    full_band_stft_loss)

    generator_loss = 0.5 * (sub_sc_loss + sub_mag_loss) + 0.5 * (full_sc_loss +
                                                                 full_mag_loss)

    p_hat = discriminator(y_hat)
    p = discriminator(tf.expand_dims(audios, 2))

    # Adversarial objective, averaged over discriminator scales.
    adv_loss = 0.0
    for i in range(len(p_hat)):
        adv_loss += calculate_3d_loss(tf.ones_like(p_hat[i][-1]),
                                      p_hat[i][-1],
                                      loss_fn=mse_loss)
    adv_loss /= len(p_hat)

    generator_loss += 2.5 * adv_loss

    per_example_losses = generator_loss

    # Mel-spectrogram distance, logged as a metric only.
    mels_spectrogram = calculate_2d_loss(audios, tf.squeeze(y_hat, -1),
                                         loss_fn=mels_loss)

    dict_metrics_losses = {
        'adversarial_loss': adv_loss,
        'gen_loss': tf.reduce_mean(generator_loss),
        'subband_spectral_convergence_loss': tf.reduce_mean(sub_sc_loss),
        'subband_log_magnitude_loss': tf.reduce_mean(sub_mag_loss),
        'fullband_spectral_convergence_loss': tf.reduce_mean(full_sc_loss),
        'fullband_log_magnitude_loss': tf.reduce_mean(full_mag_loss),
        'mels_spectrogram_loss': tf.reduce_mean(mels_spectrogram),
    }

    return per_example_losses, dict_metrics_losses
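
The transpose-and-reshape step in this example is easiest to follow with concrete shapes; a small runnable walk-through with made-up sizes (B=2 examples, 8 frames, 4 sub-bands):

import tensorflow as tf

y_mb = tf.zeros([2, 8, 4])                         # PQMF analysis: [B, T', subbands]
y_mb = tf.transpose(y_mb, (0, 2, 1))               # [2, 4, 8]
y_mb = tf.reshape(y_mb, (-1, tf.shape(y_mb)[-1]))  # [8, 8]: one row per sub-band
# The [B*subbands] STFT loss over these rows is later reshaped to
# [-1, subbands] and averaged, giving one sub-band loss per example.
print(y_mb.shape)  # (8, 8)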
Code Example #7
def model_fn(features, labels, mode, params):
    tacotron2_config = malaya_speech.config.tacotron2_config
    tacotron2_config['reduction_factor'] = reduction_factor
    c = tacotron2.Config(
        vocab_size = len(MALAYA_SPEECH_SYMBOLS) + 1, **tacotron2_config
    )
    model = tacotron2.Model(c)
    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype = tf.int32)
    # Teacher forcing: the ground-truth mels serve as both decoder input
    # and training target.
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    mel_actuals = features['mel']
    guided = features['g']
    r = model(
        input_ids,
        input_lengths,
        speaker_ids,
        mel_outputs,
        mel_lengths,
        training = True,
    )

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits = True)
    mae = tf.keras.losses.MeanAbsoluteError()

    decoder_output, post_mel_outputs, stop_token_predictions, alignment_histories = (
        r
    )
    mel_loss_before = calculate_3d_loss(
        mel_actuals, decoder_output, loss_fn = mae
    )
    mel_loss_after = calculate_3d_loss(
        mel_actuals, post_mel_outputs, loss_fn = mae
    )
    # Stop-token targets: 0.0 while the utterance is still running,
    # 1.0 from the frame where it ends onward.
    max_mel_length = tf.reduce_max(mel_lengths)
    stop_gts = tf.expand_dims(
        tf.range(max_mel_length, dtype = tf.int32), 0
    )
    stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(stop_gts, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(
        stop_gts, stop_token_predictions, loss_fn = binary_crossentropy
    )
    # Guided-attention loss: penalise attention mass far from the
    # diagonal; -1.0 entries in `guided` mark padding.
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(
        tf.abs(alignment_histories * guided) * attention_masks, axis = [1, 2]
    )
    loss_att /= tf.reduce_sum(attention_masks, axis = [1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    tf.identity(loss, 'loss')
    tf.identity(stop_token_loss, name = 'stop_token_loss')
    tf.identity(mel_loss_before, name = 'mel_loss_before')
    tf.identity(mel_loss_after, name = 'mel_loss_after')
    tf.identity(loss_att, name = 'loss_att')

    tf.summary.scalar('stop_token_loss', stop_token_loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('loss_att', loss_att)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.adamw.create_optimizer(
            loss = loss,
            init_lr = learning_rate,
            num_train_steps = num_train_steps,
            num_warmup_steps = num_warmup_steps,
            end_learning_rate = end_learning_rate,
            weight_decay_rate = weight_decay_rate,
        )
        estimator_spec = tf.estimator.EstimatorSpec(
            mode = mode, loss = loss, train_op = train_op
        )

    elif mode == tf.estimator.ModeKeys.EVAL:

        estimator_spec = tf.estimator.EstimatorSpec(
            mode = tf.estimator.ModeKeys.EVAL, loss = loss
        )

    return estimator_spec
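
The stop-token target construction is the least obvious step here; a tiny runnable example with made-up lengths shows the matrix it produces:

import tensorflow as tf

# Two utterances of 3 and 5 frames, padded to the max length 5.
mel_lengths = tf.constant([3, 5], dtype=tf.int32)
stop_gts = tf.expand_dims(tf.range(tf.reduce_max(mel_lengths)), 0)  # [1, 5]
stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])         # [2, 5]
stop_gts = tf.cast(stop_gts >= tf.expand_dims(mel_lengths, 1), tf.float32)
print(stop_gts.numpy())
# [[0. 0. 0. 1. 1.]
#  [0. 0. 0. 0. 0.]]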
Code Example #8
def model_fn(features, labels, mode, params):
    input_ids = features['text_ids']
    input_lengths = features['len_text_ids'][:, 0]
    speaker_ids = tf.constant([0], dtype=tf.int32)
    mel_outputs = features['mel']
    mel_lengths = features['len_mel'][:, 0]
    guided = features['g']

    model = tacotron2.Model(
        [input_ids, input_lengths],
        [mel_outputs, mel_lengths],
        len(MALAYA_SPEECH_SYMBOLS),
    )

    r = model.decoder_logits['outputs']
    decoder_output, post_mel_outputs, alignment_histories, _, _, _ = r
    # Keep only the scalar stop-token logit per frame: [B, T, 1] -> [B, T].
    stop_token_predictions = model.decoder_logits['stop_token_prediction']
    stop_token_predictions = stop_token_predictions[:, :, 0]

    binary_crossentropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)
    mae = tf.keras.losses.MeanAbsoluteError()

    mel_loss_before = calculate_3d_loss(mel_outputs,
                                        decoder_output,
                                        loss_fn=mae)
    mel_loss_after = calculate_3d_loss(mel_outputs,
                                       post_mel_outputs,
                                       loss_fn=mae)
    # Stop-token targets: 1.0 from the frame where each utterance ends.
    max_mel_length = tf.reduce_max(mel_lengths)
    stop_gts = tf.expand_dims(
        tf.range(max_mel_length, dtype=tf.int32), 0)
    stop_gts = tf.tile(stop_gts, [tf.shape(mel_lengths)[0], 1])
    stop_gts = tf.cast(
        tf.math.greater_equal(stop_gts, tf.expand_dims(mel_lengths, 1)),
        tf.float32,
    )
    stop_token_loss = calculate_2d_loss(stop_gts,
                                        stop_token_predictions,
                                        loss_fn=binary_crossentropy)
    attention_masks = tf.cast(tf.math.not_equal(guided, -1.0), tf.float32)
    loss_att = tf.reduce_sum(tf.abs(alignment_histories * guided) *
                             attention_masks,
                             axis=[1, 2])
    loss_att /= tf.reduce_sum(attention_masks, axis=[1, 2])
    loss_att = tf.reduce_mean(loss_att)

    loss = stop_token_loss + mel_loss_before + mel_loss_after + loss_att

    tf.identity(loss, 'loss')
    tf.identity(stop_token_loss, name='stop_token_loss')
    tf.identity(mel_loss_before, name='mel_loss_before')
    tf.identity(mel_loss_after, name='mel_loss_after')
    tf.identity(loss_att, name='loss_att')

    tf.summary.scalar('stop_token_loss', stop_token_loss)
    tf.summary.scalar('mel_loss_before', mel_loss_before)
    tf.summary.scalar('mel_loss_after', mel_loss_after)
    tf.summary.scalar('loss_att', loss_att)

    if mode == tf.estimator.ModeKeys.TRAIN:
        train_op = train.optimizer.optimize_loss(
            loss,
            tf.train.AdamOptimizer,
            parameters['optimizer_params'],
            learning_rate_scheduler,
            summaries=[
                'learning_rate',
                'variables',
                'gradients',
                'larc_summaries',
                'variable_norm',
                'gradient_norm',
                'global_gradient_norm',
            ],
            larc_params=parameters.get('larc_params', None),
            loss_scaling=parameters.get('loss_scaling', 1.0),
            loss_scaling_params=parameters.get('loss_scaling_params', None),
            clip_gradients=parameters.get('max_grad_norm', None),
        )
        estimator_spec = tf.estimator.EstimatorSpec(mode=mode,
                                                    loss=loss,
                                                    train_op=train_op)

    elif mode == tf.estimator.ModeKeys.EVAL:

        estimator_spec = tf.estimator.EstimatorSpec(
            mode=tf.estimator.ModeKeys.EVAL, loss=loss)

    return estimator_spec
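
Both Tacotron2 examples read a precomputed guided-attention matrix from features['g']. A hedged sketch of how such a matrix is commonly built, in the style of Tachibana et al. (2017); the exact recipe the project uses is an assumption, apart from the -1.0 padding sentinel visible above:

import numpy as np

def guided_attention(text_len, mel_len, max_text, max_mel, g=0.2):
    # -1.0 marks padding, matching the attention_masks test above.
    w = np.full((max_text, max_mel), -1.0, dtype=np.float32)
    n = np.arange(text_len)[:, None] / text_len  # normalised text position
    t = np.arange(mel_len)[None, :] / mel_len    # normalised mel position
    # Penalty grows with distance from the diagonal alignment.
    w[:text_len, :mel_len] = 1.0 - np.exp(-((n - t) ** 2) / (2.0 * g ** 2))
    return w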