Example #1
    def generate(self, mel):
        """Inference mode (Generates an audio waveform from a mel-spectrogram)
        """
        wav = []
        gru_cell = _init_GRUCell(self.rnn)

        # Conditioning network
        mel, _ = self.conditioning_network(mel)

        # Upsampling
        mel = F.interpolate(mel.transpose(1, 2), scale_factor=self.hop_length)
        mel = mel.transpose(1, 2)

        h = torch.zeros(mel.size(0), self.rnn_size, device=mel.device)
        x = torch.zeros(mel.size(0), device=mel.device, dtype=torch.long)
        x = x.fill_(2**(self.bits - 1))

        for mel_frame in torch.unbind(mel, dim=1):
            # Audio embedding
            x = self.quantized_audio_embedding(x)

            # Autoregressive GRU Cell
            h = gru_cell(torch.cat((x, mel_frame), dim=1), h)

            x = F.relu(self.linear_layer(h))  # the linear layer consumes the GRU state
            logits = self.output_layer(x)

            # Apply softmax over the logits and generate a distribution
            posterior = F.softmax(logits, dim=1)
            dist = torch.distributions.Categorical(posterior)

            # Sample from the distribution to generate output
            x = dist.sample()
            wav.append(x.item())

        wav = np.asarray(wav, dtype=np.int64)
        wav = librosa.mu_expand(wav - 2**(self.bits - 1),
                                mu=2**self.bits - 1)

        return wav
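
The last two lines of generate invert the mu-law quantization the model was trained on: the sampled class indices are shifted so they are centred on zero, then expanded back to floats in [-1.0, 1.0]. A standalone sketch of that step, assuming 8-bit quantization (bits = 8) and a dummy index array:

import numpy as np
import librosa

bits = 8  # assumed quantization depth (256 classes)
# Stand-in for the class indices sampled from the categorical distribution.
samples = np.random.randint(0, 2**bits, size=16000)

# Shift [0, 2**bits - 1] to signed values centred on zero, then invert the
# mu-law companding to floats in [-1.0, 1.0].
wav = librosa.mu_expand(samples - 2**(bits - 1), mu=2**bits - 1, quantize=True)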
Example #2
    def run(self,
            seed,
            num_samples,
            gc=None,
            y_len=1,
            disp_interval=None,
            label=0):
        insert_point = self.model.receptive_field
        batch_size = seed.size(0)
        seed = seed.view(batch_size, -1)
        label = torch.Tensor([label] * batch_size).view(1, -1).long().to(
            self.model.device)
        with torch.no_grad():
            if gc is None:
                x = torch.zeros(batch_size,
                                num_samples).long().to(self.model.device)
                x = torch.cat((seed, x), dim=-1)
            else:
                if len(gc) != self.model.receptive_field:
                    raise ValueError(
                        "The length of the global condition doesn't match "
                        "the model's receptive field.")
                # Pad the condition with zeros along the time axis so the
                # buffer is 2-D, matching the indexing below.
                x = torch.cat((gc, torch.zeros(
                    batch_size, num_samples).long().to(self.model.device)),
                              dim=-1)
            while insert_point < self.model.receptive_field + num_samples:
                x[:, insert_point:insert_point + y_len] = self.predict(
                    x[:,
                      insert_point - self.model.receptive_field:insert_point],
                    label).view(batch_size, -1)
                # Report progress every `disp_interval` samples; parenthesize
                # the subtraction (% binds tighter) and guard against the
                # default of None.
                if (disp_interval is not None and
                        (insert_point - self.model.receptive_field)
                        % disp_interval == 0):
                    print('Finish {}/{}'.format(
                        insert_point - self.model.receptive_field + 1,
                        num_samples))
                insert_point += y_len
        # move output from [0, 255] to [-128, 127]
        out = x[:, self.model.receptive_field:] - 128
        out = librosa.mu_expand(out.cpu().numpy(), quantize=True)

        return out
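
After mu_expand with quantize=True, out holds float samples in [-1.0, 1.0], one row per batch element. A minimal sketch of writing the first element to disk; the soundfile dependency and the 16 kHz sample rate are assumptions, not part of the snippet:

import numpy as np
import soundfile as sf

out = np.random.uniform(-1.0, 1.0, size=(1, 16000)).astype(np.float32)  # stand-in
# Write the first batch element; the sample rate here is an assumption.
sf.write('generated.wav', out[0], samplerate=16000)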
Example #3
    def generate(self, mel):
        r"""
        Generates an audio waverform from a log-Mel spectrogram.

        Parameters:
            mel (Tensor): of shape (1, seq_len, n_mels) containing the log-Mel spectrogram.

        Returns:
            Tuple[np.array, int]: The resulting waveform of shape (seq_len * hop_length) and sample rate in Hz.
        """
        wav = []
        cell = get_gru_cell(self.rnn2)

        mel, _ = self.rnn1(mel)

        mel = F.interpolate(mel.transpose(1, 2), scale_factor=self.hop_length)
        mel = mel.transpose(1, 2)

        h = torch.zeros(mel.size(0), self.rnn_size, device=mel.device)
        x = torch.zeros(mel.size(0), device=mel.device, dtype=torch.long)
        x = x.fill_(2**(self.bits - 1))

        for m in tqdm(torch.unbind(mel, dim=1), leave=False):
            x = self.embedding(x)
            h = cell(torch.cat((x, m), dim=1), h)

            x = F.relu(self.fc1(h))
            logits = self.fc2(x)

            posterior = F.softmax(logits, dim=1)
            dist = torch.distributions.Categorical(posterior)

            x = dist.sample()
            wav.append(x.item())

        wav = np.asarray(wav, dtype=np.int64)
        wav = librosa.mu_expand(wav - 2**(self.bits - 1), mu=2**self.bits - 1)
        return wav, self.sr
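
Both generate methods upsample the conditioning features with F.interpolate so that there is exactly one conditioning vector per output audio sample. A standalone sketch of that step (the hop_length value and the tensor shapes are assumptions consistent with the docstring):

import torch
import torch.nn.functional as F

hop_length = 200              # assumed hop size
mel = torch.randn(1, 50, 80)  # (batch, seq_len, n_mels)

# interpolate works on (batch, channels, time), hence the transposes;
# the default nearest-neighbour mode repeats each frame hop_length times.
up = F.interpolate(mel.transpose(1, 2), scale_factor=hop_length)
up = up.transpose(1, 2)
print(up.shape)               # torch.Size([1, 10000, 80])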
Example #4
def mulaw_decode(samples):
    # Rescale from 0..255 to -128..127, mu-law decode to -1.0..1.0,
    # then scale to int16 in -(2**15 - 1)..2**15 - 1.
    return (librosa.mu_expand(samples.astype('int16') - 128, quantize=True) *
            (2**15 - 1)).astype('int16')
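
Since librosa.mu_expand inverts librosa.mu_compress, decoders like this one can be sanity-checked with a round trip. This is a sketch, not part of the original snippet; the tolerance reflects the error of 8-bit mu-law quantization:

import numpy as np
import librosa

rng = np.random.default_rng(0)
wav = rng.uniform(-1.0, 1.0, size=1024)

codes = librosa.mu_compress(wav, mu=255, quantize=True)  # ints in [-128, 127]
recon = librosa.mu_expand(codes, mu=255, quantize=True)  # floats in [-1.0, 1.0]

# 8-bit mu-law quantization bounds the round-trip error.
assert np.max(np.abs(wav - recon)) < 0.05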
Example #5
def mu_expand(x: np.ndarray, p):
    "Mu-law expand from (C, W) in [-1., 1.] to (C, W) in [-1., 1.]"
    return librosa.mu_expand(x, mu=p.n_classes - 1, quantize=False)
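
A minimal usage sketch for this wrapper, reusing the mu_expand defined above; the Params dataclass is a hypothetical stand-in for the config object p, of which only the n_classes field is taken from the snippet:

import numpy as np
from dataclasses import dataclass

@dataclass
class Params:             # hypothetical stand-in for p
    n_classes: int = 256

x = np.random.uniform(-1.0, 1.0, size=(1, 1024))  # (C, W) mu-law-compressed floats
y = mu_expand(x, Params())                        # expanded floats, same shape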