Example #1
 def generate(self,
              synth,
              h_0,
              f0_hz,
              enc_frame_setting='fine',
              n_samples=16000):
     """
     synth:          synth to generate audio
     h_0:            initial state of RNN [batch, latent_dims]
     f0_hz:          f0 conditioning of synth [batch, f0_n_frames, 1]
     enc_frame_setting: fft/hop size
     n_samples:      output audio length in samples
     """
     h = h_0
     n_fft, hop_length = get_window_hop(enc_frame_setting)
     n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
     f0_hz = resample_frames(f0_hz, n_frames)  # match the frame count of z
     z = torch.zeros(h_0.shape[0], n_frames, self.latent_dims).to(h.device)
     for t in range(n_frames):
         # prior distribution conditioned on the current RNN state
         mu_p_t, scale_p_t = self.get_prior(h)
         prior_t = Independent(Normal(mu_p_t, scale_p_t), 1)
         prior_sample_t = prior_t.rsample()
         h = self.temporal(prior_sample_t, h)
         z[:, t, :] = prior_sample_t
     cond = {}
     cond['z'] = z
     cond['f0_hz'] = f0_hz
     y_params = self.decode(cond)
     params = synth.fill_params(y_params, cond)
     resyn_audio, outputs = synth(params, n_samples)
     return params, resyn_audio
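The frame count above follows the usual STFT relation n_frames = ceil((n_samples - n_fft) / hop_length) + 1: with a hypothetical 'fine' setting of n_fft=1024 and hop_length=256, n_samples=16000 yields ceil(14976 / 256) + 1 = 60 frames. The helpers get_window_hop and resample_frames are not shown in these examples; a minimal sketch of what they might look like, with the setting names and FFT/hop values assumed:

import torch.nn.functional as F

def get_window_hop(enc_frame_setting):
    # Hypothetical mapping; the real FFT/hop pairs live elsewhere in the repo.
    settings = {'fine': (1024, 256), 'coarse': (2048, 512)}
    return settings[enc_frame_setting]

def resample_frames(frames, n_frames):
    # Linearly interpolate a [batch, time, channels] tensor to n_frames steps.
    frames = frames.transpose(1, 2)  # [batch, channels, time]
    frames = F.interpolate(frames, size=n_frames, mode='linear',
                           align_corners=True)
    return frames.transpose(1, 2)    # [batch, n_frames, channels]
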
Example #2
    def generate(self,
                 synth,
                 h_0,
                 f0_hz,
                 attributes,
                 enc_frame_setting='fine',
                 n_samples=16000):
        """
        synth:          synth to generate audio
        h_0:            initial seed of RNN [batch, latent_dims]
        f0_hz:          f0 conditioning of synth [batch, f0_n_frames, 1]
        attributes:     attributes [batch, n_frames, attribute_size]
        enc_frame_setting: fft/hop size
        n_samples:      output audio length in samples
        """
        if len(h_0.shape) == 2:
            h = h_0[None, :, :]  # 1, batch, latent_dims
        else:
            h = h_0
        n_fft, hop_length = get_window_hop(enc_frame_setting)
        n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
        f0_hz = resample_frames(f0_hz, n_frames)  # match the frame count of z
        params_list = []
        for i in range(n_frames):
            cond = {}
            # use the current frame's attribute vector alongside the RNN state
            output = torch.cat([h.permute(1, 0, 2), attributes[:, i:i + 1, :]],
                               dim=-1)
            mu, logscale = self.psi_p(output, output)
            scale = logscale.exp()
            prior = Independent(Normal(mu, scale), 1)
            prior_sample = prior.rsample()
            cond['z'] = prior_sample
            cond['f0_hz'] = f0_hz[:, i, :].unsqueeze(1)
            cond['f0_scaled'] = hz_to_midi(cond['f0_hz']) / 127.0
            # generate x
            y = self.decode(cond)
            params = synth.fill_params(y, cond)
            params_list.append(params)
            x_tilde, _outputs = synth(
                params, n_samples=n_fft)  # write exactly one frame
            cond['audio'] = x_tilde
            # encode
            cond = self.encoder(cond)
            z_enc = cond['z']
            # get psi_q
            mu, logscale = self.psi_q(z_enc, z_enc)
            psi = torch.cat([mu, logscale], dim=-1)
            # temporal model
            _, h = self.temporal_q(psi, h)  # advance the posterior RNN one step

        param_names = params_list[0].keys()
        final_params = {}
        for pn in param_names:
            # concatenate each parameter across frames
            final_params[pn] = torch.cat([par[pn] for par in params_list],
                                         dim=1)

        final_audio, _outputs = synth(final_params, n_samples=n_samples)
        return final_params, final_audio
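This variant runs closed-loop: each frame is decoded, rendered by the synth, re-encoded, and the posterior RNN state is advanced before the next frame is sampled. A hypothetical call, with the model and synth instances and all shapes assumed:

import torch

batch, latent_dims, attr_size = 2, 64, 16
h_0 = torch.zeros(batch, latent_dims)              # initial RNN state
f0_hz = torch.full((batch, 250, 1), 220.0)         # constant 220 Hz contour
attributes = torch.zeros(batch, 60, attr_size)     # one attribute vector per frame
params, audio = model.generate(synth, h_0, f0_hz, attributes,
                               enc_frame_setting='fine', n_samples=16000)
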
Example #3
 def expand(self, cond, time_steps):
     """Make sure some conditioning has same temporal resolution as other conditioning."""
     # Add time dim of z if necessary.
     if len(cond.shape) == 2:
         cond = cond[:, None, :]
     # Expand time dim of cond if necessary.
     cond_time_steps = int(cond.shape[1])
     if cond_time_steps != time_steps:
         cond = resample_frames(cond, time_steps)
     return cond
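expand is a small shape utility: a 2-D [batch, channels] tensor first gets a singleton time axis, and any conditioning whose frame count differs from time_steps is then resampled. A hypothetical use, assuming resample_frames interpolates along the time axis:

import torch

loudness = torch.randn(2, 25, 1)        # coarse conditioning, 25 frames
loudness = model.expand(loudness, 100)  # resampled to [2, 100, 1]
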
Example #4
    def generate(self,
                 synth,
                 h_0,
                 f0_hz,
                 enc_frame_setting='fine',
                 n_samples=16000):
        """
        synth:          synth to generate audio
        h_0:            initial state of RNN [batch, latent_dims]
        f0_hz:          f0 conditioning of synth [batch, f0_n_frames, 1]
        enc_frame_setting: fft/hop size
        n_samples:      output audio length in samples
        """
        h = h_0
        n_fft, hop_length = get_window_hop(enc_frame_setting)
        n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
        f0_hz = resample_frames(f0_hz, n_frames)  # match the frame count of z
        z = torch.zeros(h_0.shape[0], n_frames, self.latent_dims).to(h.device)
        for t in range(n_frames):
            h_mu, h_scale = self.h_process(h, h)
            mu_t, logscale_t = self.psi_p(h_mu,
                                          h_scale)  # [batch, latent_size]
            scale_t = logscale_t.exp()
            prior_t = Independent(Normal(mu_t, scale_t), 1)
            prior_sample_t = prior_t.rsample()
            cond = {}
            z[:, t, :] = prior_sample_t
            cond['z'] = prior_sample_t.unsqueeze(1)
            cond['f0_hz'] = f0_hz[:, t, :].unsqueeze(1)
            cond['f0_scaled'] = hz_to_midi(cond['f0_hz']) / 127.0
            # generate x
            y = self.decode(cond)
            params = synth.fill_params(y, cond)
            x_tilde, _outputs = synth(
                params, n_samples=n_fft)  # write exactly one frame
            cond['audio'] = x_tilde
            # encode
            cond = self.encoder(cond)
            z_enc = cond['z'].squeeze(1)
            # get psi_q
            mu, logscale = self.psi_q(z_enc, z_enc)
            rnn_input = torch.cat([mu, logscale, prior_sample_t], dim=-1)
            # temporal model
            h = self.temporal_q(rnn_input, h)  # advance the posterior RNN one step

        cond = {}
        cond['z'] = z
        cond['f0_hz'] = f0_hz
        y_params = self.decode(cond)
        params = synth.fill_params(y_params, cond)
        resyn_audio, outputs = synth(params, n_samples)
        return params, resyn_audio
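Note that the per-frame synthesis in this variant only feeds the encoder and posterior RNN; the returned audio comes from decoding the accumulated z in one pass at the end. All three generate variants share the same sampling step: a reparameterized draw from a diagonal Gaussian built with Independent. A standalone sketch with assumed shapes:

import torch
from torch.distributions import Independent, Normal

mu = torch.zeros(2, 64)                    # [batch, latent]
scale = torch.ones(2, 64)
prior = Independent(Normal(mu, scale), 1)  # last dim becomes the event dim
z_t = prior.rsample()                      # differentiable sample, [2, 64]
log_p = prior.log_prob(z_t)                # one log-density per batch element, [2]
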
Example #5
    def forward(self, amplitudes, frequencies, n_samples=None):
        """Synthesize audio with sinusoid oscillators

        Args:
        amplitudes: Amplitude tensor of shape [batch, n_frames, n_sinusoids].
        frequencies: Tensor of shape [batch, n_frames, n_sinusoids].

        Returns:
        signal: A tensor of harmonic waves of shape [batch, n_samples].
        """
        if n_samples is None:
            n_samples = self.n_samples
        # Scale the amplitudes.
        if self.amp_scale_fn is not None:
            amplitudes = self.amp_scale_fn(amplitudes)
        if self.freq_scale_fn is not None:
            frequencies = self.freq_scale_fn(frequencies)

        # resample to n_samples
        amplitudes_envelope = util.resample_frames(amplitudes, n_samples)
        frequency_envelope = util.resample_frames(frequencies, n_samples)

        signal = util.oscillator_bank(frequency_envelope, amplitudes_envelope, self.sample_rate)
        return signal
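util.oscillator_bank is not shown here. The standard additive-synthesis formulation accumulates phase as the cumulative sum of per-sample angular frequency; a sketch under that assumption:

import math
import torch

def oscillator_bank(frequencies, amplitudes, sample_rate):
    # frequencies, amplitudes: [batch, n_samples, n_sinusoids] (Hz, linear gain)
    omega = 2.0 * math.pi * frequencies / sample_rate  # radians per sample
    phase = torch.cumsum(omega, dim=1)                 # running phase per oscillator
    return (amplitudes * torch.sin(phase)).sum(-1)     # mix to [batch, n_samples]
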
Example #6
 def generate(self,
              synth,
              h_0,
              f0_hz,
              attributes,
              enc_frame_setting='fine',
              n_samples=16000):
     """
     synth:          synth to generate audio
     h_0:            initial state of RNN [batch, latent_dims]
     f0_hz:          f0 conditioning of synth [batch, f0_n_frames, 1]
     attributes:     attributes [batch, attribute_size] or [batch, n_frames, attribute_size]
     enc_frame_setting: fft/hop size
     n_samples:      output audio length in samples
     """
     n_fft, hop_length = get_window_hop(enc_frame_setting)
     n_frames = math.ceil((n_samples - n_fft) / hop_length) + 1
     f0_hz = resample_frames(f0_hz, n_frames).to(h_0.device)  # match the frame count of z
     z = torch.zeros(h_0.shape[0], n_frames,
                     self.latent_dims).to(h_0.device)
     if len(attributes.shape) == 2:
         attributes = attributes[:, None, :].expand(-1, n_frames, -1)
     # set up initial prior with attributes
     z_t = torch.zeros(h_0.shape[0], self.latent_dims).to(h_0.device)
     rnn_input = torch.cat([z_t, attributes[:, 0, :]], dim=-1)
     h = self.temporal(rnn_input, h_0)
     for t in range(n_frames):
         # prior distribution conditioned on the current RNN state
         mu_p_t, scale_p_t = self.get_prior(h)
         prior_t = Independent(Normal(mu_p_t, scale_p_t), 1)
         z_t = prior_t.rsample()
         rnn_input = torch.cat([z_t, attributes[:, t, :]], dim=-1)
         h = self.temporal(rnn_input, h)
         z[:, t, :] = z_t
     cond = {}
     cond['z'] = z
     cond['f0_hz'] = f0_hz
     y_params = self.decode(cond)
     params = synth.fill_params(y_params, cond)
     resyn_audio, outputs = synth(params, n_samples)
     return params, resyn_audio
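Unlike Example #2, the attributes here condition the prior's RNN directly, so a single [batch, attribute_size] vector can be broadcast across all frames. A hypothetical call (model and synth instances assumed):

import torch

h_0 = torch.zeros(2, model.latent_dims)
f0_hz = torch.full((2, 250, 1), 440.0)  # A4 contour; resampled internally
attributes = torch.randn(2, 10)         # broadcast to [2, n_frames, 10] inside generate
params, audio = model.generate(synth, h_0, f0_hz, attributes, n_samples=16000)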