def generate(self, mel):
    """Generate an audio waveform from a mel-spectrogram (inference mode).

    The mel-spectrogram is passed through the conditioning network,
    upsampled by ``self.hop_length`` along the time axis and then decoded
    one sample at a time by an autoregressive GRU cell. At every step a
    class index is sampled from the softmax over the output logits.

    Args:
        mel: Mel-spectrogram tensor with time on dim 1. ``x.item()`` is
            called on every sampled step, so dim 0 (batch) must be 1.

    Returns:
        np.ndarray: The result of ``librosa.mu_expand`` applied to the
        sampled class indices, one value per upsampled frame.
    """
    bits = self.bits
    # Mid-point class, used both as the first input token and as the
    # offset that re-centres class indices around zero before expansion.
    midpoint = 2 ** (bits - 1)

    samples = []
    gru_cell = _init_GRUCell(self.rnn)

    # Sampling needs no gradients; without this the recurrent state would
    # keep an autograd graph alive across every generated sample.
    with torch.no_grad():
        # Conditioning network
        mel, _ = self.conditioning_network(mel)

        # Upsample along time so there is one conditioning frame per sample.
        mel = F.interpolate(mel.transpose(1, 2), scale_factor=self.hop_length)
        mel = mel.transpose(1, 2)

        h = torch.zeros(mel.size(0), self.rnn_size, device=mel.device)
        x = torch.full(
            (mel.size(0),), midpoint, device=mel.device, dtype=torch.long
        )

        for mel_frame in torch.unbind(mel, dim=1):
            # Audio embedding of the previously generated sample
            x = self.quantized_audio_embedding(x)

            # Autoregressive GRU cell
            h = gru_cell(torch.cat((x, mel_frame), dim=1), h)

            # Project the updated GRU state. Feeding the embedding `x`
            # here would make the output independent of both the
            # recurrent state and the mel frame.
            x = F.relu(self.linear_layer(h))
            logits = self.output_layer(x)

            # Sample the next class from the softmax distribution.
            posterior = F.softmax(logits, dim=1)
            x = torch.distributions.Categorical(posterior).sample()
            samples.append(x.item())

    # `np.int` was removed in NumPy 1.24; use an explicit integer dtype.
    wav = np.asarray(samples, dtype=np.int64)
    return librosa.mu_expand(wav - midpoint, mu=2 ** bits - 1)
def run(self, seed, num_samples, gc=None, y_len=1, disp_interval=None, label=0):
    """Autoregressively generate ``num_samples`` samples following ``seed``.

    A buffer holding the seed followed by zeros is filled in place: at each
    step the previous ``receptive_field`` samples are passed to
    ``self.predict`` and its output is written into the next ``y_len``
    positions.

    Args:
        seed: Tensor whose first dimension is the batch; it is reshaped to
            ``(batch_size, -1)``. Presumably holds exactly
            ``self.model.receptive_field`` samples per row, since
            generation starts writing at that index -- TODO confirm.
        num_samples: Number of new samples to generate.
        gc: Optional global condition. When given, its length must equal
            ``self.model.receptive_field``.
        y_len: Number of positions written per prediction step.
        disp_interval: Print progress whenever the number of generated
            samples is a multiple of this value. ``None`` or ``0``
            disables progress output.
        label: Value replicated once per batch item and passed to
            ``self.predict``.

    Returns:
        np.ndarray: ``librosa.mu_expand`` of the generated part of the
        buffer (everything after the first ``receptive_field`` positions).

    Raises:
        ValueError: If ``gc`` is given and its length differs from the
            model's receptive field.
    """
    receptive_field = self.model.receptive_field
    device = self.model.device
    batch_size = seed.size(0)
    seed = seed.view(batch_size, -1)
    label = torch.Tensor([label] * batch_size).view(1, -1).long().to(device)

    with torch.no_grad():
        if gc is None:
            x = torch.zeros(batch_size, num_samples).long().to(device)
            x = torch.cat((seed, x), dim=-1)
        else:
            if len(gc) != receptive_field:
                raise ValueError(
                    "The length of global condition does't match.")
            # NOTE(review): this concatenates along dim 0 with 1-D zeros on
            # the default device/dtype, while the loop below indexes `x` as
            # 2-D. The branch looks unfinished -- confirm before relying
            # on it.
            x = torch.cat((gc, torch.zeros(num_samples)), 0)

        insert_point = receptive_field
        end_point = receptive_field + num_samples
        while insert_point < end_point:
            context = x[:, insert_point - receptive_field:insert_point]
            x[:, insert_point:insert_point + y_len] = self.predict(
                context, label).view(batch_size, -1)

            generated = insert_point - receptive_field
            # Parenthesised subtraction: `%` binds tighter than `-`, so the
            # unparenthesised form compared against the wrong quantity and
            # raised TypeError for the default disp_interval=None.
            # NOTE(review): both progress lines are gated on disp_interval;
            # confirm the second was not meant to print on every step.
            if disp_interval and generated % disp_interval == 0:
                print('Finish {}/{}'.format(generated + 1, num_samples))
                print('Finish {} steps.'.format(generated))

            insert_point += y_len

    # Shift class indices from [0, 255] to [-128, 127] (assumes 256 classes,
    # matching the default mu of librosa.mu_expand).
    out = x[:, receptive_field:] - 128
    out = librosa.mu_expand(out.cpu().numpy(), quantize=True)
    return out
def generate(self, mel):
    r"""
    Generates an audio waveform from a log-Mel spectrogram.

    Parameters:
        mel (Tensor): of shape (1, seq_len, n_mels) containing the log-Mel
            spectrogram. The batch size must be 1 because each sampled
            step is read out with ``.item()``.

    Returns:
        Tuple[np.array, int]: The resulting waveform of shape
        (seq_len * hop_length) and sample rate in Hz.
    """
    # Mid-point class: first input token and the offset used to re-centre
    # class indices around zero before mu-law expansion.
    midpoint = 2 ** (self.bits - 1)

    samples = []
    cell = get_gru_cell(self.rnn2)

    # Sampling needs no gradients; without this the recurrent state would
    # keep an autograd graph alive across every generated sample.
    with torch.no_grad():
        mel, _ = self.rnn1(mel)

        # Upsample along time so there is one conditioning frame per sample.
        mel = F.interpolate(mel.transpose(1, 2), scale_factor=self.hop_length)
        mel = mel.transpose(1, 2)

        h = torch.zeros(mel.size(0), self.rnn_size, device=mel.device)
        x = torch.full(
            (mel.size(0),), midpoint, device=mel.device, dtype=torch.long
        )

        for m in tqdm(torch.unbind(mel, dim=1), leave=False):
            x = self.embedding(x)
            h = cell(torch.cat((x, m), dim=1), h)

            x = F.relu(self.fc1(h))
            logits = self.fc2(x)

            posterior = F.softmax(logits, dim=1)
            x = torch.distributions.Categorical(posterior).sample()
            samples.append(x.item())

    # `np.int` was removed in NumPy 1.24; use an explicit integer dtype.
    wav = np.asarray(samples, dtype=np.int64)
    wav = librosa.mu_expand(wav - midpoint, mu=2 ** self.bits - 1)
    return wav, self.sr
def mulaw_decode(samples):
    """Decode mu-law class indices into 16-bit PCM samples.

    The input codes (0..255) are shifted to -128..127, expanded with
    ``librosa.mu_expand`` to floats in -1.0..1.0, and scaled to the
    symmetric 16-bit range -(2**15 - 1)..2**15 - 1.

    Args:
        samples: Array of mu-law codes; must provide ``astype``.

    Returns:
        The decoded samples cast to ``int16``.
    """
    int16_peak = 2**15 - 1
    centered = samples.astype('int16') - 128
    expanded = librosa.mu_expand(centered, quantize=True)
    scaled = expanded * int16_peak
    return scaled.astype('int16')
def mu_expand(x: np.ndarray, p) -> np.ndarray:
    """Apply mu-law expansion to a continuous (non-quantized) signal.

    Thin wrapper around ``librosa.mu_expand`` with ``quantize=False`` and
    ``mu`` derived from the number of classes in ``p``.

    Args:
        x: Compressed signal. Per the original note it is laid out as
            (C, W) with values in [-1., 1.]; the layout is not checked
            here.
        p: Object exposing ``n_classes``; ``mu`` is ``p.n_classes - 1``.

    Returns:
        The expanded signal, same layout as ``x``, with values in
        [-1., 1.].
    """
    return librosa.mu_expand(x, mu=p.n_classes - 1, quantize=False)