def call(self, x, target_durations, training, durations_scalar=1.):
    """
    Forward pass: encode the input, predict durations, expand and decode.

    :param x: model input; fed to create_encoder_padding_mask and to the
        encoder prenet.
    :param target_durations: if not None, these durations drive the
        expansion step instead of the predicted ones.
    :param training: forwarded to the sub-layers that accept it.
    :param durations_scalar: factor multiplied into the predicted durations.
    :return: dict with keys 'mel', 'duration', 'expanded_mask',
        'encoder_attention' and 'decoder_attention'. 'duration' always holds
        the predicted (scaled and masked) durations, even when
        target_durations was used for the expansion.
    """
    encoder_mask = create_encoder_padding_mask(x)
    encoded = self.encoder_prenet(x)
    encoded, encoder_attention = self.encoder(
        encoded,
        training=training,
        padding_mask=encoder_mask,
        drop_n_heads=self.drop_n_heads)

    scaled_durations = self.dur_pred(encoded, training=training) * durations_scalar
    # Zero the durations wherever the mask is 1
    # (presumably the padded positions - confirm against the mask helper).
    keep = 1. - tf.reshape(encoder_mask, tf.shape(scaled_durations))
    durations = keep * scaled_durations

    # Target durations, when supplied, take precedence for the expansion.
    expansion_durations = durations if target_durations is None else target_durations
    expanded = self.expand(encoded, expansion_durations)
    expanded_mask = create_mel_padding_mask(expanded)

    decoder_input = self.decoder_prenet(expanded)
    decoded, decoder_attention = self.decoder(
        decoder_input,
        training=training,
        padding_mask=expanded_mask,
        drop_n_heads=self.drop_n_heads,
        reduction_factor=1)
    projected = self.out(decoded)
    mels = self.decoder_postnet(projected, training=training)

    return {
        'mel': mels,
        'duration': durations,
        'expanded_mask': expanded_mask,
        'encoder_attention': encoder_attention,
        'decoder_attention': decoder_attention,
    }
def _call_encoder(self, inputs, training):
    """
    Run the encoder directly on the raw inputs (no prenet in this variant).

    :param inputs: encoder input; also used to build the padding mask.
    :param training: forwarded to the encoder.
    :return: tuple (encoder output, padding mask, attention weights).
    """
    mask = create_encoder_padding_mask(inputs)
    encoded, attention = self.encoder(inputs, training=training, mask=mask)
    return encoded, mask, attention
def _call_encoder(self, inputs, training):
    """
    Apply the encoder prenet and the encoder to the inputs.

    :param inputs: encoder input; also used to build the padding mask.
    :param training: forwarded to the encoder.
    :return: tuple (encoder output, padding mask, attention weights).
    """
    mask = create_encoder_padding_mask(inputs)
    prenet_out = self.encoder_prenet(inputs)
    encoded, attention = self.encoder(
        prenet_out,
        training=training,
        padding_mask=mask,
        drop_n_heads=self.drop_n_heads)
    return encoded, mask, attention
def _call_encoder(self, inputs, xvectors, training):
    """
    Encode the inputs and append the projected x-vectors to the result.

    :param inputs: encoder input; also used to build the padding mask.
    :param xvectors: speaker vectors, passed through self.enc_speaker_mod
        before being concatenated to the encoder output.
    :param training: forwarded to the encoder.
    :return: tuple (encoder output with x-vectors concatenated,
        padding mask, attention weights).
    """
    mask = create_encoder_padding_mask(inputs)
    prenet_out = self.encoder_prenet(inputs)
    encoded, attention = self.encoder(
        prenet_out,
        training=training,
        padding_mask=mask,
        drop_n_heads=self.drop_n_heads)

    speaker_features = self.enc_speaker_mod(xvectors)
    # The concatenation happens along axis 1.
    # NOTE(review): the returned padding mask is built from `inputs` only, so
    # if axis 1 is the sequence axis the mask no longer matches the extended
    # output length - confirm against the caller.
    encoded = tf.keras.layers.concatenate([encoded, speaker_features], axis=1)
    return encoded, mask, attention
def call(self,
         x,
         target_durations,
         spk_emb,
         training,
         durations_scalar=1.,
         max_durations_mask=None,
         min_durations_mask=None):
    """
    Forward pass with speaker conditioning and optional duration clamping.

    :param x: model input; fed to create_encoder_padding_mask and to the
        encoder prenet.
    :param target_durations: if not None, used for the expansion instead of
        the predicted durations (durations_scalar is then not applied).
    :param spk_emb: speaker embedding; projected by self.speaker_fc, passed
        through softplus and added to the encoder output.
    :param training: forwarded to the sub-layers that accept it.
    :param durations_scalar: factor applied to the predicted durations when
        they are the ones used for the expansion.
    :param max_durations_mask: if not None, element-wise upper bound on the
        durations used for the expansion.
    :param min_durations_mask: if not None, element-wise lower bound on the
        durations used for the expansion; applied after the upper bound.
    :return: dict with keys 'mel', 'duration', 'expanded_mask',
        'encoder_attention' and 'decoder_attention'. 'duration' holds the raw
        predicted durations (unscaled and unclamped).
    """
    encoder_padding_mask = create_encoder_padding_mask(x)
    hidden = self.encoder_prenet(x)
    hidden, encoder_attention = self.encoder(
        hidden,
        training=training,
        padding_mask=encoder_padding_mask,
        drop_n_heads=0)

    # Drop axes 1 and 2 of the encoder mask, append a trailing axis and
    # invert it, so entries that were 1 in the mask become 0 here.
    squeezed_mask = tf.squeeze(encoder_padding_mask, axis=(1, 2))
    inverted_mask = 1. - squeezed_mask[:, :, None]

    # Insert axis 1 into the speaker projection before adding it to the
    # encoder output (presumably so it broadcasts over the sequence axis).
    speaker_bias = tf.math.softplus(self.speaker_fc(spk_emb))
    speaker_bias = tf.expand_dims(speaker_bias, 1)
    hidden = hidden + speaker_bias

    durations = self.dur_pred(hidden, training=training, mask=inverted_mask)

    use_durations = (target_durations
                     if target_durations is not None
                     else durations * durations_scalar)
    if max_durations_mask is not None:
        upper = tf.expand_dims(max_durations_mask, -1)
        use_durations = tf.math.minimum(use_durations, upper)
    if min_durations_mask is not None:
        lower = tf.expand_dims(min_durations_mask, -1)
        use_durations = tf.math.maximum(use_durations, lower)

    expanded = self.expand(hidden, use_durations)
    expanded_mask = create_mel_padding_mask(expanded)
    decoded, decoder_attention = self.decoder(
        expanded,
        training=training,
        padding_mask=expanded_mask,
        drop_n_heads=0)
    mels = self.out(decoded)

    return {
        'mel': mels,
        'duration': durations,
        'expanded_mask': expanded_mask,
        'encoder_attention': encoder_attention,
        'decoder_attention': decoder_attention,
    }
def _attention_to_integer_durations(ref_attention_weights, target_total):
    """
    Convert soft attention into integer durations that sum to target_total.

    :param ref_attention_weights: attention matrix; it is summed over axis 0
        to obtain one attention mass per remaining position.
    :param target_total: value the returned durations must add up to.
    :return: array of rounded durations whose sum equals target_total.
    :raises ValueError: if the normalized durations are not finite (e.g. the
        attention sums to zero). Without this check the correction loop below
        could never terminate, because NaN compares unequal to zero yet is
        neither greater nor smaller than it.
    """
    # takes actual attention values and normalizes to the target total
    attention_durations = np.sum(ref_attention_weights, axis=0)
    normalized_durations = attention_durations * (
        target_total / np.sum(attention_durations))
    if not np.all(np.isfinite(normalized_durations)):
        raise ValueError(
            'Cannot normalize attention durations: attention sum is '
            f'{np.sum(attention_durations)} for a target of {target_total}.')

    integer_durations = np.round(normalized_durations)
    duration_diff = np.sum(integer_durations) - target_total
    while duration_diff != 0:
        rounding_diff = integer_durations - normalized_durations
        if duration_diff > 0:
            # duration is too long -> reduce highest (positive) rounding difference
            integer_durations[np.argmax(rounding_diff)] -= 1
        else:
            # duration is too short -> increase lowest (negative) rounding difference
            integer_durations[np.argmin(rounding_diff)] += 1
        duration_diff = np.sum(integer_durations) - target_total
    return integer_durations


def get_durations_from_alignment(batch_alignments,
                                 mels,
                                 phonemes,
                                 weighted=False,
                                 binary=False,
                                 fill_gaps=False,
                                 fix_jumps=False,
                                 fill_mode='max'):
    """
    Extract per-phoneme integer durations from attention alignments.

    :param batch_alignments: attention weights from autoregressive model.
    :param mels: mel spectrograms.
    :param phonemes: phoneme sequence.
    :param weighted: if True use weighted average of durations of heads, best head if False.
    :param binary: if True take maximum attention peak, sum if False.
    :param fill_gaps: if True fills zeros durations with ones.
    :param fix_jumps: if True, tries to scan alignments for attention jumps and interpolate.
    :param fill_mode: used only if fill_gaps is True. Is either 'max' or 'next'. Defines where to take the duration
        needed to fill the gap. Next takes it from the next non-zeros duration value, max from the sequence maximum.
    :return: tuple (durations, unpad_mels, unpad_phonemes, final_alignment) of
        lists with one entry per batch item: the integer durations, the mel
        and phoneme sequences with padding and first/last position removed,
        and the best head's transposed attention summed with the alignment
        matrix rebuilt from the durations.
    :raises ValueError: in non-binary mode, if the attention cannot be
        normalized to finite durations.
    """
    assert (binary is True) or (
        fix_jumps is False), 'Cannot fix jumps in non-binary attention.'
    mel_pad_mask = create_mel_padding_mask(mels)
    phon_pad_mask = create_encoder_padding_mask(phonemes)
    durations = []
    # remove start end token or vector
    unpad_mels = []
    unpad_phonemes = []
    final_alignment = []
    for i, al in enumerate(batch_alignments):
        # Unpadded length = mask width minus the number of masked positions.
        mel_len = int(mel_pad_mask[i].shape[-1] - np.sum(mel_pad_mask[i]))
        phon_len = int(phon_pad_mask[i].shape[-1] - np.sum(phon_pad_mask[i]))
        # first dim is heads
        unpad_alignments = al[:, 1:mel_len - 1, 1:phon_len - 1]
        unpad_mels.append(mels[i, 1:mel_len - 1, :])
        unpad_phonemes.append(phonemes[i, 1:phon_len - 1])

        alignments_weights = weight_mask(unpad_alignments[0])
        heads_scores = []
        scored_attention = []
        for attention_weights in unpad_alignments:
            score = np.sum(alignments_weights * attention_weights)
            scored_attention.append(attention_weights / score)
            heads_scores.append(score)

        # The lowest-scoring head is the reference in non-weighted mode and
        # is always the one blended into the final alignment.
        best_head = np.argmin(heads_scores)
        best_attention = unpad_alignments[best_head]
        if weighted:
            ref_attention_weights = np.sum(scored_attention, axis=0)
        else:
            ref_attention_weights = best_attention

        if binary:
            # pick max attention for each mel time-step
            binary_attn, binary_score = binary_attention(ref_attention_weights)
            if fix_jumps:
                binary_attn = fix_attention_jumps(
                    binary_attn=binary_attn,
                    alignments_weights=alignments_weights,
                    binary_score=binary_score)
            integer_durations = binary_attn.sum(axis=0)
        else:
            integer_durations = _attention_to_integer_durations(
                ref_attention_weights, mel_len - 2)

        if fill_gaps:
            # fill zeros durations
            integer_durations = fill_zeros(integer_durations,
                                           take_from=fill_mode)

        assert np.sum(
            integer_durations
        ) == mel_len - 2, f'{np.sum(integer_durations)} vs {mel_len - 2}'
        new_alignment = duration_to_alignment_matrix(
            integer_durations.astype(int))
        final_alignment.append(best_attention.T + new_alignment)
        durations.append(integer_durations)
    return durations, unpad_mels, unpad_phonemes, final_alignment