Example #1
def main(audio_files, model_filename, output_dir, batch_size, speaker_id,
         implementation):
    audio_files = utils.files_to_list(audio_files)
    model = torch.load(model_filename)['model']
    model.eval()
    wavenet = nv_wavenet.NVWaveNet(
        **(model.decoders[speaker_id].export_weights()))

    for files in chunker(audio_files, batch_size):
        audio_ = []
        for file_path in files:
            print(file_path)
            audio, sampling_rate = utils.load_wav_to_torch(file_path)
            if sampling_rate != 16000:
                raise ValueError("{} SR doesn't match target {} SR".format(
                    sampling_rate, 16000))
            audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE, 256)
            audio = utils.to_gpu(audio)
            audio_.append(torch.unsqueeze(audio, 0))
        latent = model.get_latent_input(torch.cat(audio_, 0))
        cond_input = model.decoders[speaker_id].get_cond_input(latent)
        audio_data = wavenet.infer(cond_input, implementation)

        for i, file_path in enumerate(files):
            file_name = os.path.splitext(os.path.basename(file_path))[0]

            audio = utils.mu_law_decode_numpy(audio_data[i, :].cpu().numpy(),
                                              wavenet.A)
            audio = utils.MAX_WAV_VALUE * audio
            wavdata = audio.astype('int16')
            write("{}/{}.wav".format(output_dir, file_name), 16000, wavdata)
Example #2
 def __getitem__(self, index):
     # Read audio
     filename = self.audio_files[index]
     wav = deepaudio.load_wav(filename)
     # load in raw_audio via utils
     raw_audio, _ = utils.load_wav_to_torch(filename)
     # convert the numpy waveform to a torch tensor
     audio = torch.from_numpy(wav)
     # take segment
     if audio.size(0) >= self.segment_length:
         max_audio_start = audio.size(0) - self.segment_length
         audio_start = random.randint(0, max_audio_start)
         audio = audio[audio_start:audio_start + self.segment_length]
         # update raw audio as well
         raw_audio = raw_audio[audio_start:audio_start +
                               self.segment_length]
     else:
         audio = torch.nn.functional.pad(
             audio, (0, self.segment_length - audio.size(0)),
             'constant').data
         # pad raw audio as well
         raw_audio = torch.nn.functional.pad(
             raw_audio, (0, self.segment_length - raw_audio.size(0)),
             'constant').data
     # compute mel
     mel = deepaudio.melspectrogram(audio.numpy())
     # convert mel to torch
     mel = torch.from_numpy(mel)
     audio = utils.mu_law_encode(raw_audio / utils.MAX_WAV_VALUE,
                                 self.mu_quantization)
     return (mel, audio)
Example #3
def main(audio_file_path, model_filename, output_path):
    model = torch.load(model_filename, map_location=torch.device('cpu'))['model']
    
    # mels = []
    # for file_path in files:
    #     print(file_path)
    #     mel = torch.load(file_path)
    #     mel = utils.to_gpu(mel)
    #     mels.append(torch.unsqueeze(mel, 0))
    # cond_input = model.get_cond_input(torch.cat(mels, 0))
    # audio_data = wavenet.infer(cond_input, implementation)
    first_audio_data, _ = utils.load_wav_to_torch(audio_file_path)
    first_audio_data = first_audio_data[:10000]
    first_audio_data = utils.mu_law_encode(first_audio_data / utils.MAX_WAV_VALUE, 256)
    print("first_audio_data.shape", first_audio_data.shape)
    print("first_audio_data.dtype", first_audio_data.dtype)
    audio_data = model.generate(first_samples=first_audio_data, num_samples=1000, receptive_field=6000)
    np.savetxt("audio_data.txt", audio_data.numpy().astype(int), fmt='%d')
    # for i, file_path in enumerate(files):
    # file_name = os.path.splitext(os.path.basename(file_path))[0]
    
    audio = utils.mu_law_decode_numpy(audio_data.cpu().numpy(), model.n_out_channels)
    audio = utils.MAX_WAV_VALUE * audio
    print("audio: ", audio)
    wavdata = audio.astype('int16')
    write(output_path, 16000, wavdata)
Example #4
    def __getitem__(self, index):
        audios = self.audio_buffer[index]
        rand_pos = np.random.randint(0, len(audios) - self.sample_size)

        if self.use_local_condition:
            local_condition = self.fbank_buffer[index]
            local_condition = np.repeat(local_condition,
                                        self.upsample_factor,
                                        axis=0)
            local_condition = local_condition[rand_pos:rand_pos +
                                              self.sample_size]
        else:
            audios = np.pad(audios, [[self.receptive_field, 0], [0, 0]],
                            'constant')
            local_condition = None

        audios = audios[rand_pos:rand_pos + self.sample_size]
        target = mu_law_encode(audios, self.quantization_channels)
        if self.noise_injecting:
            noise = np.random.normal(0.0, 1.0 / self.quantization_channels,
                                     audios.shape)
            audios = audios + noise

        audios = np.pad(audios, [[self.receptive_field, 0], [0, 0]],
                        'constant')
        # local_condition is None when use_local_condition is False,
        # so only pad and convert it when it is present
        if local_condition is not None:
            local_condition = np.pad(local_condition,
                                     [[self.receptive_field, 0], [0, 0]],
                                     'constant')
            local_condition = torch.FloatTensor(local_condition)
        return torch.FloatTensor(audios), torch.LongTensor(target), local_condition
Example #5
    def __getitem__(self, index):
        filename = self.audio_files[index]
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("Sampling rate doesn't match")

        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = F.pad(audio, (0, self.segment_length - audio.size(0)),
                          'constant').data

        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return audio
Example #6
    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return (mel, audio)
Example #7
 def quantize(self, wave):
     """convert the wave to a discrete integer format"""
     return mu_law_encode(torch.tensor(wave), self.mu_quantization)
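For reference, the mu_law_encode helper used throughout these examples typically implements the standard mu-law companding transform followed by quantization to integer bins. A minimal sketch, assuming a torch tensor input scaled to [-1, 1] and 256 quantization channels (the helper name and exact signature here are illustrative, not any project's actual API):

import math
import torch

def mu_law_encode_sketch(audio, quantization_channels=256):
    # Standard mu-law companding: compress amplitude, then quantize to integer bins.
    mu = quantization_channels - 1
    audio = torch.clamp(audio, -1.0, 1.0)
    magnitude = torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    signal = torch.sign(audio) * magnitude
    # Map [-1, 1] onto the integer range [0, mu].
    return ((signal + 1) / 2 * mu + 0.5).long()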
Example #8
 def __call__(self, data):
     data['audio_quantized'] = mu_law_encode(
         data['audio'], quantization_channels=self.quantization_channels)
     return data
Example #9
    def inference(self,
                  cond_features,
                  use_logistic_mix=False,
                  teacher_audio=None,
                  mu_quantization=256,
                  randomize_input=False,
                  rand_sample_chance=0.,
                  length=0,
                  batch_size=0,
                  cond_channels=0,
                  device="cuda"):
        """
        Generates audio samples equivalent to the length of upsampled cond features
        - Will use teacher audio as forward input, if provided
        - If teacher_audio_length < features_length, switches forward input to inference
              samples when the teacher samples are exhausted.
        - If cond_features=None, generates unconditional output. Last four params 
              (length, batch_size, cond_channels, device) control unconditional output.
        """

        assert ((cond_features is not None) or (length > 0))

        # get metadata from condition features
        if cond_features is not None:
            assert (len(cond_features.size()) == 3)

            device = cond_features.device
            length = cond_features.size(-1) * self.upscale
            cond_channels = cond_features.size(1)
            batch_size = cond_features.size(0)

            if (self.upscale != 1):
                cond_features = self.upsample(cond_features)

        else:
            assert (batch_size > 0 and cond_channels > 0)
            cond_features = torch.zeros(
                size=[batch_size, cond_channels, length]).to(device)

        if self.use_cond_conv:
            # make condition features for every timestep and res layer
            cond_features = self.cond_layers(cond_features)
        if not self.same_cond_each_resblock:
            cond_features = cond_features.view(batch_size, self.n_layers,
                                               2 * self.n_residual_channels,
                                               length)

        # output buffers
        logits = torch.zeros(self.n_out_channels, length).to(device)
        output_audio = torch.zeros(size=[batch_size, length + 1]).to(device)
        output_audio = utils.mu_law_encode(output_audio)

        if teacher_audio is not None:
            teacher_length = teacher_audio.size(1)
        else:
            teacher_length = 0

        if use_logistic_mix:
            sampler = SampleDiscretizedMixLogistics()
        else:
            sampler = utils.CategoricalSampler()

        #################
        # inference loop:
        ##################
        start_time = time.time()
        print("Inference progress:")
        for s in range(length - 1):

            # print progress every 100 samples
            if (s % 100 == 0):
                print(str(s / length), end='\r', flush=True)

            if self.use_cond_conv:
                cond_sample = cond_features[:, :, :, s]
            else:
                cond_sample = cond_features[:, :, s]

            # flip a biased coin to decide whether a random sample is used
            if randomize_input and (random.uniform(0, 1) < rand_sample_chance):
                forward_sample = torch.randint_like(output_audio[:, s],
                                                    low=0,
                                                    high=mu_quantization)
            else:
                # draw from teacher or previous sample?
                if (s < teacher_length):
                    forward_sample = teacher_audio[:, s].clone()
                else:
                    forward_sample = output_audio[:, s].clone()

            logits[:, s + 1] = self.infer_step(cond_sample, forward_sample)

            output_audio[:, s + 1] = sampler(logits[:, s + 1])

        end_time = time.time()
        ###################
        # end inference
        ###################

        print("Inference complete in " + str(end_time - start_time) + " seconds")

        return utils.mu_law_decode(output_audio, mu_quantization)
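A minimal usage sketch for the inference method above, assuming a trained model that exposes it, conditioning features of shape (batch, cond_channels, frames) on the right device, and the same utils / scipy write helpers used in the earlier examples (all variable names here are illustrative):

# Hypothetical call to inference(); cond_features is an assumed
# (batch, cond_channels, frames) tensor of conditioning features.
with torch.no_grad():
    generated = model.inference(cond_features, use_logistic_mix=False)
# inference() returns mu-law decoded audio roughly in [-1, 1];
# scale to 16-bit PCM and save, as the other examples do.
wavdata = (generated[0].cpu().numpy() * utils.MAX_WAV_VALUE).astype('int16')
write("generated.wav", 16000, wavdata)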
Example #10
 def collect_features(self, path):
     # 1. Load audio --> 2. pre-emphasis --> 3. 8-bit mu-law
     x, fs = librosa.load(path, sr=self.target_sr, mono=True, dtype=np.float64)
     x = x * 1.3
     x_mulaw = mu_law_encode(x)
     return x_mulaw.astype(np.uint8)
Example #11
    def __getitem__(self, index):
        # Read audio
        audio_filename, mel_filename = self.audio_files[index]

        audio, sample_rate = utils.load_wav(audio_filename)
        pad_size = self.window_size - self.window_step
        left_pad = pad_size
        right_pad = pad_size + self.window_step - len(audio) % self.window_step
        audio = np.pad(audio, (left_pad, right_pad),
                       mode="constant",
                       constant_values=0)
        audio /= np.abs(audio).max()

        if self.apply_preemphasis:
            audio = self.preemphasis(audio)
            audio /= np.abs(audio).max()

        if sample_rate != self.sample_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sample_rate, self.sample_rate))
        if self.no_chunks:
            if self.load_mel:
                mel = np.load(mel_filename).T
            else:
                # as by default lws always pad from left and right
                mel = self.get_mel(audio[left_pad:-right_pad])
        else:
            if mel_filename != "" and self.load_mel:
                if self.segment_length % self.window_step != 0:
                    raise ValueError(
                        "Hop length should be a divider of segment length")
                mel = np.load(mel_filename)
                mel = np.clip(mel, -self.audio_params["max_abs_value"],
                              self.audio_params["max_abs_value"])
                # Take segment
                if mel.shape[0] >= self.mel_segment_length:
                    max_mel_start = mel.shape[0] - self.mel_segment_length
                    mel_start = random.randint(0, max_mel_start)
                    mel = mel[mel_start:mel_start + self.mel_segment_length]
                    assert mel.shape[0] == self.mel_segment_length
                    audio_start = mel_start * self.window_step
                    audio = audio[audio_start:audio_start +
                                  self.segment_length]
                    assert audio.shape[0] == self.segment_length
                else:
                    audio = np.pad(audio,
                                   (0, self.segment_length - audio.shape[0]),
                                   'constant')
                    # pad the time axis (axis 0) so the mel segment has the full length
                    mel = np.pad(
                        mel,
                        ((0, self.mel_segment_length - mel.shape[0]), (0, 0)),
                        'constant')
            else:
                if audio.shape[0] >= self.segment_length:
                    max_audio_start = audio.shape[0] - self.segment_length
                    audio_start = random.randint(0, max_audio_start)
                    audio = audio[audio_start:audio_start +
                                  self.segment_length]
                else:
                    audio = np.pad(audio,
                                   (0, self.segment_length - audio.shape[0]),
                                   'constant')
                mel = self.get_mel(audio)

        mel_length = min(mel.shape[1], len(audio) // self.window_step)
        mel = mel[:, :mel_length]
        audio = audio[:mel_length * self.window_step]
        # as we want to apply transpose convolution
        assert len(audio) // self.window_step == mel.shape[1]
        mel = torch.FloatTensor(mel)
        audio = torch.FloatTensor(audio)
        audio = utils.mu_law_encode(audio, self.mu_quantization)
        return mel, audio