Example #1
    def get_mel(self, filename):
        if not self.load_mel_from_disk:
            audio, sampling_rate = load_wav_to_torch(filename)
            if sampling_rate != self.stft.sampling_rate:
                raise ValueError("{} {} SR doesn't match target {} SR".format(
                    sampling_rate, self.stft.sampling_rate))
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)
        else:
            melspec = torch.from_numpy(np.load(filename))
            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))

        return melspec
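Note: this and most of the following examples call a load_wav_to_torch helper that is never shown. As a point of reference, here is a minimal sketch in the style of the common Tacotron 2 utils.py version (an assumption; some examples below use variants that take a target sampling rate or also return the file's max value):

import torch
from scipy.io.wavfile import read

def load_wav_to_torch(full_path):
    # Read integer PCM audio and return it as a float tensor plus the file's rate.
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype('float32')), sampling_rate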
Example #2
    def __getitem__(self, index):
        filename = self.audio_files[index]
        filename = os.path.join('dataset', filename)
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            print(filename)
            raise ValueError("Sampling rate doesn't math")

        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = F.pad(audio, (0, self.segment_length - audio.size(0)),
                          'constant').data

        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return audio
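utils.mu_law_encode is external to this snippet. A minimal sketch of standard mu-law companding with the same signature (an assumption about the helper, built from the textbook formula sign(x) * ln(1 + mu|x|) / ln(1 + mu)):

import math
import torch

def mu_law_encode(audio, mu_quantization=256):
    # Compress [-1, 1] audio with the mu-law curve, then quantize the result
    # into mu_quantization integer bins in [0, mu_quantization - 1].
    mu = mu_quantization - 1.0
    compressed = torch.sign(audio) * torch.log1p(mu * torch.abs(audio)) / math.log1p(mu)
    return ((compressed + 1.0) / 2.0 * mu + 0.5).long()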
Example #3
 def get_mel(self, filename, stft):
     # mel spectrograms are cached next to the audio as '<name>_<n_mels>.npy'
     mel_path = filename[:-4] + '_' + str(stft.n_mel_channels) + '.npy'
     try:
         melspec = torch.from_numpy(np.load(mel_path))
         assert melspec.size(0) == stft.n_mel_channels, (
             'Mel dimension mismatch: given {}, expected {}'.format(
                 melspec.size(0), stft.n_mel_channels))
     except (FileNotFoundError, AssertionError):
         audio, sampling_rate = load_wav_to_torch(filename)
         if sampling_rate != stft.sampling_rate:
             raise ValueError("{} {} SR doesn't match target {} SR".format(
                 sampling_rate, stft.sampling_rate))
         audio_norm = audio / self.max_wav_value
         audio_norm = audio_norm.unsqueeze(0)
         audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
         melspec = stft.mel_spectrogram(audio_norm)
         melspec = torch.squeeze(melspec, 0)
         with open(mel_path, 'wb+') as f:
             np.save(f, melspec.numpy())
     return melspec
Example #4
 def get_mel(self, filename):
     if not self.load_mel_from_disk:
         audio, sampling_rate, max_value = load_wav_to_torch(filename)
         if self.audio_offset: # used for extreme GTA'ing
             audio = audio[self.audio_offset:]
         # I'm not sure how, but sometimes the magnitude of the audio exceeds
         # the max of the datatype used before casting, so take the largest
         # observed value.
         self.max_wav_value = max(max_value, audio.max().item(), -audio.min().item())
         if sampling_rate != self.stft.sampling_rate:
             raise ValueError("{} {} SR doesn't match target {} SR".format(
                 sampling_rate, self.stft.sampling_rate))
         audio_norm = audio / self.max_wav_value
         audio_norm = audio_norm.unsqueeze(0)
         audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
         melspec = self.stft.mel_spectrogram(audio_norm)
         melspec = torch.squeeze(melspec, 0)
     else:
         melspec = torch.from_numpy(np.load(filename, allow_pickle=True)).float()
         assert melspec.size(0) == self.stft.n_mel_channels, (
             'Mel dimension mismatch: given {}, expected {}'.format(
                 melspec.size(0), self.stft.n_mel_channels))
     return melspec
Example #5
 def get_audio(self, filename):
     audio, sampling_rate = load_wav_to_torch(filename)
     if sampling_rate != self.sampling_rate:
         raise ValueError("{} {} SR doesn't match target {} SR".format(
             sampling_rate, self.sampling_rate))
     audio_norm = audio / self.max_wav_value
     audio_norm = audio_norm.unsqueeze(0)
     spec_filename = filename.replace(".wav", ".spec.pt")
     if os.path.exists(spec_filename):
         spec = torch.load(spec_filename)
     else:
         spec = spectrogram_torch(audio_norm,
                                  self.filter_length,
                                  self.sampling_rate,
                                  self.hop_length,
                                  self.win_length,
                                  center=False)
         spec = torch.squeeze(spec, 0)
         torch.save(spec, spec_filename)
     return spec, audio_norm
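spectrogram_torch is likewise not shown. A rough sketch of a linear magnitude spectrogram with the same call shape (an assumption, not the exact implementation, which may pad the waveform before the STFT):

import torch

def spectrogram_torch(y, filter_length, sampling_rate, hop_length, win_length,
                      center=False):
    # Returns a [1, filter_length // 2 + 1, frames] magnitude spectrogram;
    # sampling_rate is unused here and kept only to match the call above.
    window = torch.hann_window(win_length, device=y.device)
    spec = torch.stft(y, filter_length, hop_length=hop_length,
                      win_length=win_length, window=window, center=center,
                      return_complex=True)
    return spec.abs()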
Example #6
    def get_mel_and_f0(self, filepath):
        audio, sampling_rate = load_wav_to_torch(filepath)
        audio_norm = audio / self.max_wav_value
        # if sampling_rate != self.stft.sampling_rate:
        #     raise ValueError("{} SR doesn't match target {} SR".format(
        #         sampling_rate, self.stft.sampling_rate))
        # audio_norm = audio_norm.unsqueeze(0)
        # melspec = self.stft.mel_spectrogram(audio_norm)
        # melspec = torch.squeeze(melspec, 0)

        melspec = linearspectrogram_torch(audio_norm)  # use aukit's spectrogram generation scheme

        f0 = self.get_f0(audio.cpu().numpy(), sampling_rate,
                         self.filter_length, self.hop_length, self.f0_min,
                         self.f0_max, self.harm_thresh)
        f0 = torch.from_numpy(f0)[None]
        # f0 = f0[:, :melspec.size(1)]

        # replace F0 with a zero vector
        # f0 = torch.zeros(1, melspec.shape[1], dtype=torch.float)
        return melspec, f0
Example #7
def get_mel_and_f0(filepath, filter_length, hop_length, win_length, n_mel_channels, sampling_rate, mel_fmin, mel_fmax, f0_min, f0_max, harm_thresh):
    stft = layers.TacotronSTFT(filter_length, hop_length, win_length,
            n_mel_channels, sampling_rate, mel_fmin,
            mel_fmax)
    audio, sampling_rate = load_wav_to_torch(filepath)
    if sampling_rate != stft.sampling_rate:
        raise ValueError("{} SR doesn't match target {} SR".format(
            sampling_rate, stft.sampling_rate))
    audio_norm = audio
    # I changed them to float32 during preprocessing so this normalization is unnecessary.
    audio_norm = audio_norm.unsqueeze(0)
    melspec = stft.mel_spectrogram(audio_norm)
    melspec = torch.squeeze(melspec, 0)

    f0 = get_f0(audio.cpu().numpy(), sampling_rate,
                filter_length, hop_length, f0_min,
                f0_max, harm_thresh)
    f0 = torch.from_numpy(f0)[None]
    f0 = f0[:, :melspec.size(1)]

    return melspec, f0
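Neither f0 example shows get_f0. As a hypothetical stand-in, a sketch built on librosa's YIN pitch tracker (an assumption; the original code may use a different tracker, and harm_thresh is mapped onto YIN's trough_threshold purely for illustration):

import numpy as np
import librosa

def get_f0(audio, sampling_rate, frame_length, hop_length,
           f0_min, f0_max, harm_thresh=0.1):
    # One fundamental-frequency estimate per hop, as a float array.
    return librosa.yin(audio.astype(np.float32), fmin=f0_min, fmax=f0_max,
                       sr=sampling_rate, frame_length=frame_length,
                       hop_length=hop_length, trough_threshold=harm_thresh)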
Example #8
    def __getitem__(self, index):
        # Read audio
        filename = self.audio_files[index]
        audio, sampling_rate = utils.load_wav_to_torch(filename)
        if sampling_rate != self.sampling_rate:
            raise ValueError("{} SR doesn't match target {} SR".format(
                sampling_rate, self.sampling_rate))

        # Take segment
        if audio.size(0) >= self.segment_length:
            max_audio_start = audio.size(0) - self.segment_length
            audio_start = random.randint(0, max_audio_start)
            audio = audio[audio_start:audio_start + self.segment_length]
        else:
            audio = torch.nn.functional.pad(
                audio, (0, self.segment_length - audio.size(0)),
                'constant').data

        mel = self.get_mel(audio)
        audio = utils.mu_law_encode(audio / utils.MAX_WAV_VALUE,
                                    self.mu_quantization)
        return (mel, audio)
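For completeness, the inverse of the encoder sketched after Example #2: standard mu-law expansion (again an assumed implementation of the unseen utils helper):

import math
import torch

def mu_law_decode(encoding, mu_quantization=256):
    # Map integer bins back to [-1, 1], then undo the companding curve.
    mu = mu_quantization - 1.0
    signal = encoding.float() / mu * 2.0 - 1.0
    magnitude = torch.expm1(torch.abs(signal) * math.log1p(mu)) / mu
    return torch.sign(signal) * magnitude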
Example #9
    def get_mel(self, fb):
        if self.load_mel_from_disk:
            cur_mel_path = os.path.join(self.mel_path, fb + '.npy')
            melspec = np.load(cur_mel_path)
            mean, std = np.load(self.MelStd_mel)
            melspec = (melspec - mean) / std
            melspec = np.transpose(melspec)
            melspec = torch.from_numpy(melspec)
        else:
            cur_audio_path = os.path.join(self.audio_path, fb + '.wav')
            audio = load_wav_to_torch(cur_audio_path, self.sampling_rate)
            audio_norm = audio / self.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = self.stft.mel_spectrogram(audio_norm)
            melspec = torch.squeeze(melspec, 0)  # [mel_bin, T]

            assert melspec.size(0) == self.stft.n_mel_channels, (
                'Mel dimension mismatch: given {}, expected {}'.format(
                    melspec.size(0), self.stft.n_mel_channels))
        return melspec
Example #10
def prepare_mel_npy(hparams, audiopath_and_text):

    audiopath_and_texts = load_filepaths_and_text(audiopath_and_text)
    stft = layers.TacotronSTFT(hparams)  # build the STFT once, not per file

    for i in range(len(audiopath_and_texts)):
        audiopath, text, speaker_id = audiopath_and_texts[i][:3]
        audio, sampling_rate = load_wav_to_torch(audiopath)

        audio_norm = audio / hparams.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
        melspec = stft.mel_spectrogram(audio_norm)
        melspec = torch.squeeze(melspec, 0)

        out_dir = audiopath[:11]      # assumes a fixed-length directory prefix
        file_name = audiopath[12:-4]  # path after the prefix, minus '.wav'

        file = os.path.join(out_dir, file_name)

        np.save(file, melspec)
        print("{} / {}".format(i,len(audiopath_and_texts)))
    pass
Example #11
def dnp(run_name,
        noisy_file,
        samples_dir,
        LR=0.001,
        num_iter=5000,
        save_every=50):

    # initiate model
    nlayers = 6
    model = Unet(nlayers=nlayers, nefilters=60).cuda()
    samples_dir = os.path.join(samples_dir, run_name)
    utils.makedirs(samples_dir)
    # load data
    target, sr = utils.load_wav_to_torch(noisy_file)
    target = target[:(len(target) // 2**nlayers) * 2**nlayers]
    target = target / utils.MAX_WAV_VALUE
    net_input = torch.rand_like(target)
    net_input = (net_input - 0.5) * 2
    target, net_input = target.cuda(), net_input.cuda()
    criterion = torch.nn.MSELoss()

    optimize(model.parameters(), model, criterion, net_input, target,
             samples_dir, LR, num_iter, sr, save_every)
Example #12
                        help='Directory to put Mel-Spectrogram Tensors')
    parser.add_argument('-c',
                        '--config',
                        type=str,
                        help='JSON file for configuration')

    args = parser.parse_args()

    filepaths = utils.files_to_list(args.audio_list)

    # Make directory if it doesn't exist
    if not os.path.isdir(args.output_dir):
        os.makedirs(args.output_dir)
        os.chmod(args.output_dir, 0o775)

    # Parse config.  Only using data processing
    with open(args.config) as f:
        data = f.read()
    config = json.loads(data)
    data_config = config["data_config"]
    mel_factory = Mel2SampOnehot(**data_config)

    for filepath in filepaths:
        audio, sampling_rate = utils.load_wav_to_torch(filepath)
        assert (sampling_rate == mel_factory.sampling_rate)
        melspectrogram = mel_factory.get_mel(audio)
        filename = os.path.basename(filepath)
        new_filepath = os.path.join(args.output_dir, filename + '.pt')
        print(new_filepath)
        torch.save(melspectrogram, new_filepath)
Example #13
 def get_mel(self, filename):
     audio = load_wav_to_torch(filename, self.sampling_rate)
     mel = self.mel_transform_fn.transform(audio.view(1, -1))
     return mel
Example #14
    hparams = create_hparams()
    stft = layers.TacotronSTFT(hparams.filter_length, hparams.hop_length,
                               hparams.win_length, hparams.n_mel_channels,
                               hparams.sampling_rate, hparams.mel_fmin,
                               hparams.mel_fmax)

    if gen_mel:
        audio_files = sorted(glob('audio/*.wav'))
        out_dir = 'mel'
        SaveMkdir(out_dir)
        for file in tqdm(audio_files):
            tqdm.write(file)
            file_basename = os.path.basename(file).split('.')[0]
            audio_path = os.path.join(hparams.audio_path,
                                      file_basename + '.wav')
            audio, sampling_rate = load_wav_to_torch(audio_path,
                                                     hparams.sampling_rate)
            audio_norm = audio / hparams.max_wav_value
            audio_norm = audio_norm.unsqueeze(0)
            audio_norm = torch.autograd.Variable(audio_norm,
                                                 requires_grad=False)
            melspec = stft.mel_spectrogram(audio_norm)
            # transpose before saving: rows represent frames, columns represent features
            melspec = torch.squeeze(melspec, 0).numpy().transpose()

            out_file = os.path.join(out_dir, file_basename + '.npy')
            np.save(out_file, melspec)

        mean_std = cal_MeanStd(out_dir, hparams.n_mel_channels, ref_file=None)
        np.save(os.path.join(out_dir, os.pardir, 'MeanStd_Tacotron_mel.npy'),
                mean_std)
Example #15
 def __getitem__(self, index):
     item = self.audios[index]
     return load_wav_to_torch(item)[0]
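All of these __getitem__ methods are meant to be driven by a torch.utils.data.DataLoader. A self-contained toy showing the wiring (ToyAudioDataset is a placeholder, not any of the datasets above):

import torch
from torch.utils.data import Dataset, DataLoader

class ToyAudioDataset(Dataset):
    # Mimics the datasets above: fixed-length fake waveforms in [-1, 1].
    def __init__(self, n_items=8, segment_length=16000):
        self.n_items = n_items
        self.segment_length = segment_length

    def __len__(self):
        return self.n_items

    def __getitem__(self, index):
        return torch.rand(self.segment_length) * 2 - 1

loader = DataLoader(ToyAudioDataset(), batch_size=4, shuffle=True)
for audio in loader:
    print(audio.shape)  # torch.Size([4, 16000])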