Example #1
    def decode(self, latent, features):
        """Decode segment-level latents into an LF0 trajectory; also log segment metrics."""
        # Bring every input stream up to frame rate before concatenating.
        segment_lengths = features['segment_n_frames']
        frame_latents = utils.upsample_to_repetitions(latent, segment_lengths)
        frame_phones = utils.upsample_to_repetitions(
            features['phones'], features['dur']).type(torch.float)

        decoder_inputs = torch.cat(
            (frame_latents, frame_phones, features['normalised_counters']),
            dim=-1)

        # Decode; the layer returns a second value that is not needed here.
        pred_norm_lf0_deltas, _ = self.decoder_layer(
            decoder_inputs, seq_len=features['n_frames'])

        # Undo normalisation of the predicted delta features.
        pred_lf0_deltas = self.normalisers['lf0'].denormalise(
            pred_norm_lf0_deltas, deltas=True)

        # MLPG selects the most probable static trajectory given the
        # delta and delta-delta features.
        delta_variances = self.normalisers['lf0'].delta_params['std_dev'] ** 2
        pred_lf0 = MLPG(means=pred_lf0_deltas, variances=delta_variances)

        outputs = {
            'normalised_lf0_deltas': pred_norm_lf0_deltas,
            'lf0_deltas': pred_lf0_deltas,
            'lf0': pred_lf0
        }

        # Per-segment mean F0 of the reference signal, used for metric logging only.
        reference_f0 = torch.exp(features['lf0'])
        f0_segments = utils.split_to_segments(reference_f0, segment_lengths)
        mean_f0_per_segment = (torch.sum(f0_segments, dim=2)
                               / segment_lengths.type(f0_segments.dtype))

        self.metrics.accumulate(self.mode,
                                embeddings=(latent, features['n_segments']),
                                name=[features['name']],
                                n_segments=features['n_segments'],
                                segment_mean_F0=(mean_f0_per_segment, features['n_segments']))

        return outputs
Example #2
    def predict(self, features):
        """Predict an LF0 trajectory from linguistic features via deltas + MLPG."""
        # Upsample phone-level labels to frame rate and append counter features.
        frame_labels = utils.upsample_to_repetitions(
            features['normalised_lab'], features['dur'])
        model_inputs = torch.cat(
            (frame_labels, features['normalised_counters']), dim=-1)

        # Run the recurrent model over the (padded) frame sequences.
        pred_norm_lf0_deltas = self.recurrent_layers(
            model_inputs, seq_len=features['n_frames'])

        # Undo normalisation of the predicted delta features.
        pred_lf0_deltas = self.normalisers['lf0'].denormalise(
            pred_norm_lf0_deltas, deltas=True)

        # MLPG selects the most probable static trajectory given the
        # delta and delta-delta features.
        delta_variances = self.normalisers['lf0'].delta_params['std_dev'] ** 2
        pred_lf0 = MLPG(means=pred_lf0_deltas, variances=delta_variances)

        return {
            'normalised_lf0_deltas': pred_norm_lf0_deltas,
            'lf0_deltas': pred_lf0_deltas,
            'lf0': pred_lf0
        }
Example #3
    def decode(self, latent, features):
        """Decode a sentence-level latent vector into an LF0 trajectory."""
        n_frames = features['n_frames']

        # Tile the single latent across every frame of the longest sequence.
        frame_latents = latent.unsqueeze(1).repeat(1, torch.max(n_frames), 1)

        # Phone-level labels upsampled to frame rate.
        frame_labels = utils.upsample_to_repetitions(
            features['normalised_lab'], features['dur'])

        decoder_inputs = torch.cat(
            (frame_latents, frame_labels, features['normalised_counters']),
            dim=-1)

        pred_norm_lf0_deltas = self.decoder_layer(decoder_inputs,
                                                  seq_len=n_frames)

        # Undo normalisation of the predicted delta features.
        pred_lf0_deltas = self.normalisers['lf0'].denormalise(
            pred_norm_lf0_deltas, deltas=True)

        # MLPG selects the most probable static trajectory given the
        # delta and delta-delta features.
        pred_lf0 = MLPG(
            means=pred_lf0_deltas,
            variances=self.normalisers['lf0'].delta_params['std_dev'] ** 2)

        return {
            'normalised_lf0_deltas': pred_norm_lf0_deltas,
            'lf0_deltas': pred_lf0_deltas,
            'lf0': pred_lf0
        }
Example #4
    def predict(self, features):
        """Predict LF0: recurrent delta prediction followed by padded MLPG."""
        n_frames = features['n_frames']

        # Frame-rate inputs: upsampled labels plus counter features.
        frame_labels = utils.upsample_to_repetitions(
            features['normalised_lab'], features['dur'])
        model_inputs = torch.cat(
            (frame_labels, features['normalised_counters']), dim=-1)

        pred_norm_lf0_deltas = self.recurrent_layers(model_inputs,
                                                     seq_len=n_frames)

        # Undo normalisation of the predicted delta features.
        pred_lf0_deltas = self.normalisers['lf0'].denormalise(
            pred_norm_lf0_deltas, deltas=True)

        # MLPG (padded variant) with variances taken from the normaliser's
        # delta statistics.
        global_variance = self.normalisers['lf0'].delta_params['std_dev'] ** 2
        pred_lf0 = viz.synthesis.MLPG(pred_lf0_deltas, global_variance,
                                      padding_size=100, seq_len=n_frames)

        return {'lf0_deltas': pred_lf0_deltas, 'lf0': pred_lf0}
Example #5
    def predict(self, features):
        """Predict LF0 with a mixture density model: take the most likely
        mixture component per frame, then run MLPG on its statistics."""
        # Frame-rate inputs: upsampled labels plus counter features.
        frame_labels = utils.upsample_to_repetitions(
            features['normalised_lab'], features['dur'])
        model_inputs = torch.cat(
            (frame_labels, features['normalised_counters']), dim=-1)

        n_frames = features['n_frames']
        predicted = self.recurrent_layers(model_inputs, seq_len=n_frames)

        # Slice the flat prediction into mixing logits, means, and log-variances.
        n_pi = self.n_components
        n_mean = self.n_components * self.output_dim
        pi = predicted[..., :n_pi]
        means = torch.split(predicted[..., n_pi:n_pi + n_mean],
                            self.output_dim, dim=-1)
        log_variances = torch.split(predicted[..., n_pi + n_mean:],
                                    self.output_dim, dim=-1)

        # Apply the variance floor (in the log domain).
        floor = np.log(self.var_floor)
        log_variances = [torch.clamp(lv, min=floor) for lv in log_variances]

        # Reparameterise: mixing weights must form a distribution and
        # standard deviations must be non-negative.
        pi = F.softmax(pi, dim=-1)
        std_devs = [torch.exp(lv * 0.5) for lv in log_variances]

        pred_norm_lf0_deltas_GMM = GaussianMixtureModel(pi, means, std_devs)

        # Take the most likely component per frame and find the most
        # probable trajectory from its mean/variance using MLPG.
        component_mean, component_std_dev = \
            pred_norm_lf0_deltas_GMM.argmax_components()
        pred_norm_lf0 = MLPG(means=component_mean,
                             variances=component_std_dev ** 2)
        pred_lf0 = self.normalisers['lf0'].denormalise(pred_norm_lf0)

        return {
            'normalised_lf0_deltas_GMM': pred_norm_lf0_deltas_GMM,
            'lf0': pred_lf0
        }
Example #6
    def encode(self, features):
        """Encode frame-level features, returning a mean and log-variance pair
        (presumably the parameters of a Gaussian latent — confirm in encoder_layer)."""
        # Phone-level labels upsampled to frame rate.
        frame_labels = utils.upsample_to_repetitions(
            features['normalised_lab'], features['dur'])

        # Acoustic deltas, linguistic labels, and counters are stacked
        # along the feature axis.
        encoder_inputs = torch.cat(
            (features['normalised_lf0_deltas'], frame_labels,
             features['normalised_counters']),
            dim=-1)

        mean, log_variance = self.encoder_layer(
            encoder_inputs, seq_len=features['n_frames'])

        return mean, log_variance
Example #7
    def predict(self, features):
        """Predict all acoustic streams (LF0, V/UV, MCEP, BAP) from
        linguistic features in a single forward pass."""
        # Frame-rate inputs: upsampled labels plus counter features.
        frame_labels = utils.upsample_to_repetitions(
            features['normalised_lab'], features['dur'])
        model_inputs = torch.cat(
            (frame_labels, features['normalised_counters']), dim=-1)

        pred_norm_deltas = self.layers(model_inputs,
                                       seq_len=features['n_frames'])

        # Split the joint prediction into the individual feature streams.
        stream_dims = [self.output_dims[name]
                       for name in ['lf0', 'vuv', 'mcep', 'bap']]
        (pred_norm_lf0_deltas, pred_vuv,
         pred_norm_mcep_deltas, pred_norm_bap_deltas) = torch.split(
            pred_norm_deltas, stream_dims, dim=-1)

        # Convert each normalised delta stream to its final feature via the
        # shared helper.
        pred_lf0 = self._prepare_output('lf0', pred_norm_lf0_deltas)
        pred_mcep = self._prepare_output('mcep', pred_norm_mcep_deltas)
        pred_bap = self._prepare_output('bap', pred_norm_bap_deltas)

        # Map V/UV logits to probabilities.
        pred_vuv = torch.sigmoid(pred_vuv)

        return {
            'normalised_lf0_deltas': pred_norm_lf0_deltas,
            'normalised_mcep_deltas': pred_norm_mcep_deltas,
            'normalised_bap_deltas': pred_norm_bap_deltas,
            'lf0': pred_lf0,
            'vuv': pred_vuv,
            'mcep': pred_mcep,
            'bap': pred_bap,
        }