Example #1
0
    def _get_next_example(self):
        """Build one training example, recomputing the mel target from the source wav.

        Rows of ``self._train_meta`` are consumed in order. Once the offset
        runs past the end of the list, it is reset to 0 and the list is
        shuffled in place before the next row is read.

        Returns:
            A tuple ``(input_data, mel_target, token_target, embed_target,
            mel_length)`` where ``mel_length`` is ``len(mel_target)``.
        """
        # Start a new pass, in a fresh random order, once every row was served.
        if self._train_offset >= len(self._train_meta):
            self._train_offset = 0
            np.random.shuffle(self._train_meta)

        entry = self._train_meta[self._train_offset]
        self._train_offset += 1

        # Field 5 of a metadata row holds the text of the utterance.
        sequence = text_to_sequence(entry[5], self._cleaner_names)
        input_data = np.asarray(sequence, dtype=np.int32)

        # The mel is not loaded from entry[1]; it is recomputed from a wav whose
        # location is derived from the audio path stored in entry[0]. A file
        # name such as
        #   audio-P00001A-001-20170001P00001A0001_00.npy
        # is split on '-', its leading piece is dropped, the '_..' suffix of the
        # last piece is replaced by '.wav', and the remaining pieces are appended
        # as path components to <parts[:3]>/<parts[2]>/wavs.
        path_parts = Path(entry[0]).parts
        pieces = path_parts[-1].split('-')
        pieces[-1] = pieces[-1].split('_')[0] + '.wav'
        wav_fpath = Path(*path_parts[:3], path_parts[2], 'wavs', *pieces[1:])

        wav = load_wav(wav_fpath, hparams.sample_rate)
        if hparams.rescale:
            # Peak-normalize, then scale to the configured maximum.
            peak = np.abs(wav).max()
            wav = wav / peak * hparams.rescaling_max
        mel_target = melspectrogram(wav, hparams).T
        n_frames = len(mel_target)

        # Parallel sequence of zeros marking frames of a non-finished sequence.
        token_target = np.asarray([0.] * (n_frames - 1))
        embed_target = np.load(entry[2])
        return input_data, mel_target, token_target, embed_target, n_frames
Example #2
0
    def _get_test_groups(self):
        """Build the next test example, recomputing the mel target from the source wav.

        Reads the row at ``self._test_offset`` from ``self._test_meta`` and
        advances the offset by one. No wrap-around or shuffling happens here.

        Returns:
            A tuple ``(input_data, mel_target, token_target, embed_target,
            mel_length)`` where ``mel_length`` is ``len(mel_target)``.
        """
        entry = self._test_meta[self._test_offset]
        self._test_offset += 1

        # Field 5 of a metadata row holds the text of the utterance.
        sequence = text_to_sequence(entry[5], self._cleaner_names)
        input_data = np.asarray(sequence, dtype=np.int32)

        # The mel is not loaded from entry[1]; the wav location is derived from
        # the audio path in entry[0]: the file name is split on '-', its leading
        # piece is dropped, the '_..' suffix of the last piece becomes '.wav',
        # and the pieces are appended to <parts[:3]>/<parts[2]>/wavs.
        path_parts = Path(entry[0]).parts
        pieces = path_parts[-1].split('-')
        pieces[-1] = pieces[-1].split('_')[0] + '.wav'
        wav_fpath = Path(*path_parts[:3], path_parts[2], 'wavs', *pieces[1:])

        wav = load_wav(wav_fpath, hparams.sample_rate)
        if hparams.rescale:
            # Peak-normalize, then scale to the configured maximum.
            peak = np.abs(wav).max()
            wav = wav / peak * hparams.rescaling_max
        mel_target = melspectrogram(wav, hparams).T
        n_frames = len(mel_target)

        # Parallel sequence of zeros marking frames of a non-finished sequence.
        token_target = np.asarray([0.] * (n_frames - 1))
        # entry[2] is used directly as the path of the embedding file.
        embed_target = np.load(entry[2])
        return input_data, mel_target, token_target, embed_target, n_frames
def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
    """Compute a float32 mel spectrogram from a wav path or a loaded waveform.

    Args:
        fpath_or_wav: a path (``str`` or ``Path``), which is read through
            ``load_preprocess_wav``, or anything else, which is treated as an
            already-loaded waveform and used as-is.

    Returns:
        The result of ``audio.melspectrogram`` cast to ``np.float32``.
    """
    given_path = isinstance(fpath_or_wav, (str, Path))
    wav = load_preprocess_wav(fpath_or_wav) if given_path else fpath_or_wav
    return audio.melspectrogram(wav, hparams).astype(np.float32)
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    """Compute and save the mel spectrogram, PPG and audio of one utterance.

    Notes carried over from the original implementation, for anyone changing
    this function or writing their own synthesizer:
    - The audio, the mel spectrogram and the PPG are saved as numpy arrays.
    - The audio is written exactly as received; per the original note the
      only upstream processing is volume normalization (in
      split_on_silences).
    - Per the original note, pre-emphasis is applied before the mel
      spectrogram is computed, which is why the vocoder re-applies it.
    - Librosa pads the waveform before computing the mel spectrogram while
      the saved waveform has no extra padding, so wav length and mel length
      are not exactly related. See the vocoder data loader.

    Args:
        wav: waveform of the utterance.
        text: transcript, passed through unchanged into the result.
        out_dir: directory containing the "mels", "audio" and "ppgs" folders.
        basename: identifier used to build the output file names.
        skip_existing: when true, do nothing if all outputs already exist.
        hparams: hyper-parameter object (sample rate, hop size, limits, ...).

    Returns:
        ``None`` when the utterance is skipped (already processed, too short,
        or too long while ``hparams.clip_mels_length`` is set). Otherwise the
        tuple ``(wav_name, mel_name, ppg_name, embed_name, len(wav),
        n_frames, text)``.
    """
    mel_fpath = out_dir.joinpath("mels", "mel-%s.npy" % basename)
    wav_fpath = out_dir.joinpath("audio", "audio-%s.npy" % basename)
    ppg_fpath = out_dir.joinpath("ppgs", "ppg-%s.npy" % basename)

    # Skip only when *every* output exists. Checking just the mel and the audio
    # would leave an utterance with a missing PPG unprocessed forever.
    outputs = (mel_fpath, wav_fpath, ppg_fpath)
    if skip_existing and all(fpath.exists() for fpath in outputs):
        return None

    # Skip utterances that are too short
    if len(wav) < hparams.utterance_min_duration * hparams.sample_rate:
        return None

    # Compute the mel spectrogram; frames run along axis 1.
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Skip utterances that are too long
    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    # Compute the PPG from a 16-bit integer copy of the waveform.
    # NOTE(review): assumes wav lies in [-1, 1]; values outside wrap around in
    # the int16 cast - confirm against the caller.
    wav_ppg = (wav * 32767).astype(np.int16)
    # Hop length expressed in milliseconds (assuming hop_size is in samples).
    frame_shift_ms = hparams.hop_size / hparams.sample_rate * 1000
    if hparams.use_full_ppg:
        ppg = audio.get_ppg(wav_ppg, hparams.sample_rate, frame_shift_ms)
    else:
        ppg = audio.get_monophone_ppg(wav_ppg, hparams.sample_rate,
                                      frame_shift_ms)
    ppg_frames = ppg.shape[0]

    # Sometimes ppg can be 1 frame longer than mel: trim both to the shorter.
    min_frames = min(mel_frames, ppg_frames)
    mel_spectrogram = mel_spectrogram[:, :min_frames]
    ppg = ppg[:min_frames, :]

    # Write the spectrogram (transposed), the audio and the ppg to disk
    np.save(mel_fpath, mel_spectrogram.T, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)
    np.save(ppg_fpath, ppg, allow_pickle=False)

    # Return a tuple describing this training example. The embed file is only
    # named here; it is not written by this function.
    return (wav_fpath.name, mel_fpath.name, ppg_fpath.name,
            "embed-%s.npy" % basename, len(wav), min_frames, text)
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Build a mel spectrogram from an audio file or waveform, in the same manner as the
        mel spectrograms that were fed to the synthesizer when training.

        A ``str`` or ``Path`` argument is loaded through
        ``Synthesizer.load_preprocess_wav``; any other argument is used as the
        waveform directly. The result is cast to ``np.float32``.
        """
        given_path = isinstance(fpath_or_wav, (str, Path))
        if given_path:
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav
        return audio.melspectrogram(wav, hparams).astype(np.float32)
Example #6
0
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    """Rescale, volume-normalize and silence-trim one utterance, then save it with its mel.

    Notes for anyone changing this function or writing their own synthesizer:
    - Both the audio and the mel spectrogram are saved as numpy arrays.
    - The audio written to disk is the waveform after the optional rescale,
      the volume normalization and the silence trimming done below.
    - Per the original note, pre-emphasis is applied before the mel
      spectrogram is computed, which is why the vocoder re-applies it.
    - Librosa pads the waveform before computing the mel spectrogram while
      the saved waveform has no extra padding, so wav length and mel length
      are not exactly related. See the vocoder data loader.

    Returns:
        ``None`` when the utterance is skipped (outputs already exist, too
        short after trimming, or too long while ``hparams.clip_mels_length``
        is set). Otherwise the tuple ``(wav_name, mel_name, embed_name,
        len(wav), mel_frames, text)``.
    """
    mel_path = out_dir / "mels" / ("mel-%s.npy" % basename)
    audio_path = out_dir / "audio" / ("audio-%s.npy" % basename)

    # Nothing to do when both outputs are already on disk.
    if skip_existing and mel_path.exists() and audio_path.exists():
        return None

    # Peak-normalize, then scale to the configured maximum.
    if hparams.rescale:
        peak = np.abs(wav).max()
        wav = wav / peak * hparams.rescaling_max

    # LogMMSE denoising is left disabled:
    #   from utils import logmmse
    #   wav = logmmse.denoise(wav, profile, eta=0)

    # VAD process: raise the volume if needed, then trim long silences.
    from encoder.audio import normalize_volume, trim_long_silences
    wav = trim_long_silences(normalize_volume(wav, -30, increase_only=True))

    # Drop utterances that are too short once trimmed.
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    # Mel spectrogram; frames run along axis 1.
    mel = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel.shape[1]

    # Drop utterances that are too long, when clipping is enabled.
    too_long = mel_frames > hparams.max_mel_frames
    if too_long and hparams.clip_mels_length:
        return None

    # Persist the (transposed) spectrogram and the processed audio.
    np.save(mel_path, mel.T, allow_pickle=False)
    np.save(audio_path, wav, allow_pickle=False)

    # Describe this training example; the embed file is only named here.
    embed_name = "embed-%s.npy" % basename
    return audio_path.name, mel_path.name, embed_name, len(wav), mel_frames, text
Example #7
0
def process_utterance(wav: np.ndarray, text: str, out_dir: Path, basename: str,
                      skip_existing: bool, hparams):
    """Save one utterance's audio and mel spectrogram as .npy files.

    Arguments (per the original author's notes):
        wav: one wav of one complete sentence, taken from the alignment file
            by the split_on_silence function.
        text: text of that wav.
        out_dir: where the spectrogram and the audio are saved in npy format.
        basename: name of the file.

    Steps, with the original author's annotation of which ones are needed:
        skip existing if found        -> required
        skip if sentence too short    -> not required
        compute spectrogram           -> required
        skip if sentence too long     -> not required
        save spectrogram and audio    -> required

    Further notes for anyone changing this function:
    - Both the audio and the mel spectrogram are saved as numpy arrays.
    - The audio is written exactly as received; per the original note the
      only upstream processing is volume normalization (in
      split_on_silences).
    - Per the original note, pre-emphasis is applied before the mel
      spectrogram is computed, which is why the vocoder re-applies it.
    - Librosa pads the waveform before computing the mel spectrogram while
      the saved waveform has no extra padding, so wav length and mel length
      are not exactly related. See the vocoder data loader.

    Returns:
        ``None`` when the utterance is skipped, otherwise the tuple
        ``(wav_name, mel_name, embed_name, len(wav), mel_frames, text)``.
    """
    mel_path = out_dir / "mels" / ("mel-%s.npy" % basename)
    audio_path = out_dir / "audio" / ("audio-%s.npy" % basename)

    # Nothing to do when both outputs are already on disk.
    if skip_existing and mel_path.exists() and audio_path.exists():
        return None

    # Drop utterances that are too short.
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    # Mel spectrogram; frames run along axis 1.
    mel = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel.shape[1]

    # Drop utterances that are too long, when clipping is enabled.
    too_long = mel_frames > hparams.max_mel_frames
    if too_long and hparams.clip_mels_length:
        return None

    # Persist the (transposed) spectrogram and the audio.
    np.save(mel_path, mel.T, allow_pickle=False)
    np.save(audio_path, wav, allow_pickle=False)

    # Describe this training example; the embed file is only named here.
    embed_name = "embed-%s.npy" % basename
    return audio_path.name, mel_path.name, embed_name, len(wav), mel_frames, text
def process_audio_file(vfile, args, gpu_id):
    """Compute mel and linear spectrograms for the audio extracted from one video.

    The output directory is ``vfile`` with 'intervals' replaced by
    'preprocessed' and everything from the last '.' onwards removed. The file
    'audio.wav' is read from that directory and 'mels.npz' is written into it
    with the arrays stored under the keys ``spec`` and ``lspec``.

    ``args`` and ``gpu_id`` are accepted but not used by this function.
    """
    # NOTE: on Windows the path needs adjusting - the separator there is '\\'.
    out_dir = vfile.replace('intervals', 'preprocessed')
    dot = out_dir.rfind('.')
    out_dir = out_dir[:dot]  # ignore extension
    os.makedirs(out_dir, exist_ok=True)

    # The wav is expected to be present in the output directory already.
    wav = audio.load_wav(path.join(out_dir, 'audio.wav'), hp.sample_rate)
    mel = audio.melspectrogram(wav, hp)
    linear = audio.linearspectrogram(wav, hp)
    np.savez_compressed(path.join(out_dir, 'mels.npz'), spec=mel, lspec=linear)
Example #9
0
def process_utterance(wav: np.ndarray,
                      text: str,
                      out_dir: Path,
                      basename: str,
                      skip_existing: bool,
                      hparams,
                      random_uttBasename_forSpkEmbedding=None):
    """Save one utterance's audio and mel spectrogram as .npy files.

    Args:
        wav: waveform of the utterance.
        text: transcript, passed through unchanged into the result.
        out_dir: directory containing the "mels" and "audio" folders.
        basename: identifier used to build the output file names.
        skip_existing: when true, do nothing if both outputs already exist.
        hparams: hyper-parameter object (sample rate, limits, ...).
        random_uttBasename_forSpkEmbedding: if not None, this basename is used
            instead of ``basename`` to name the embed file, so that another
            utterance provides the speaker embedding in synthesizer training.

    Notes for anyone changing this function or writing their own synthesizer:
    - Both the audio and the mel spectrogram are saved as numpy arrays.
    - The audio is written exactly as received; per the original note the
      only upstream processing is volume normalization (in
      split_on_silences).
    - Per the original note, pre-emphasis is applied before the mel
      spectrogram is computed, which is why the vocoder re-applies it.
    - Librosa pads the waveform before computing the mel spectrogram while
      the saved waveform has no extra padding, so wav length and mel length
      are not exactly related. See the vocoder data loader.

    Returns:
        ``None`` when the utterance is skipped, otherwise the tuple
        ``(wav_name, mel_name, embed_name, len(wav), mel_frames, text)``.
    """
    mel_path = out_dir / "mels" / ("mel-%s.npy" % basename)
    audio_path = out_dir / "audio" / ("audio-%s.npy" % basename)

    # Nothing to do when both outputs are already on disk.
    if skip_existing and mel_path.exists() and audio_path.exists():
        return None

    # Drop utterances that are too short.
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    # Mel spectrogram; frames run along axis 1.
    mel = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel.shape[1]

    # Drop utterances that are too long, when clipping is enabled.
    too_long = mel_frames > hparams.max_mel_frames
    if too_long and hparams.clip_mels_length:
        return None

    # Persist the (transposed) spectrogram and the audio.
    np.save(mel_path, mel.T, allow_pickle=False)
    np.save(audio_path, wav, allow_pickle=False)

    # The embed file is named after the override basename when one is given.
    if random_uttBasename_forSpkEmbedding is None:
        embed_basename = basename
    else:
        embed_basename = random_uttBasename_forSpkEmbedding
    embed_name = "embed-%s.npy" % embed_basename
    return audio_path.name, mel_path.name, embed_name, len(wav), mel_frames, text
Example #10
0
def process_video_file(vfile, args, gpu_id):
    """Extract audio, spectrograms and per-frame face detections from one video.

    Everything is written into a directory derived from ``vfile``: the
    '/intervals/' path segment is replaced by '/preprocessed/' and the file
    extension is dropped. That directory receives 'audio.wav', 'mels.npz'
    (arrays ``spec`` and ``lspec``) and one '<frame index>.jpg' per frame for
    which the detector returned a result.

    Args:
        vfile: path of the video file.
        args: options object; only ``args.batch_size`` is read here.
        gpu_id: index selecting the detector in the module-level ``fa``.
    """
    video_stream = cv2.VideoCapture(vfile)

    # Read every frame into memory; release the capture even if reading fails.
    frames = []
    try:
        while True:
            still_reading, frame = video_stream.read()
            if not still_reading:
                break
            frames.append(frame)
    finally:
        video_stream.release()

    # Strip the extension from the *relocated* path. Slicing `vfile` here would
    # discard the '/intervals/' -> '/preprocessed/' substitution and put the
    # outputs next to the source video.
    fulldir = vfile.replace('/intervals/', '/preprocessed/')
    fulldir = fulldir[:fulldir.rfind('.')]  # ignore extension

    os.makedirs(fulldir, exist_ok=True)

    wavpath = path.join(fulldir, 'audio.wav')
    specpath = path.join(fulldir, 'mels.npz')

    # NOTE(security): the command is built by string formatting and run through
    # the shell, so file names containing shell metacharacters are unsafe.
    # `template` is defined outside this function, so it is left unchanged here.
    command = template.format(vfile, hp.sample_rate, wavpath)
    subprocess.call(command, shell=True)

    wav = audio.load_wav(wavpath, hp.sample_rate)
    spec = audio.melspectrogram(wav, hp)
    lspec = audio.linearspectrogram(wav, hp)
    np.savez_compressed(specpath, spec=spec, lspec=lspec)

    batches = [
        frames[start:start + args.batch_size]
        for start in range(0, len(frames), args.batch_size)
    ]

    # frame_idx counts frames across batches so that image names match the
    # position of the frame in the video, including frames with no detection.
    frame_idx = -1
    for batch in batches:
        preds = fa[gpu_id].get_detections_for_batch(np.asarray(batch))

        for detection in preds:
            frame_idx += 1
            if detection is None:
                continue

            # The first element of a detection is what gets saved as the image
            # (presumably the cropped face - confirm against the detector).
            cv2.imwrite(path.join(fulldir, '{}.jpg'.format(frame_idx)),
                        detection[0])
Example #11
0
def wav2mel(wav):
    """Return the mel spectrogram of ``wav``, computed with the module-level ``hparams``."""
    mel = melspectrogram(wav, hparams)
    return mel
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """
    Preprocesses a single utterance wav/text pair

    This writes the audio, the mel scale spectrogram and the linear scale
    spectrogram to disk and returns a tuple to write to the train.txt file.

    Args:
        - mel_dir: the directory to write the mel spectrograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectrogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - None when the wav file is missing, or when the utterance has more
          than hparams.max_mel_frames frames and hparams.clip_mels_length is set
        - otherwise a tuple: (audio_filename, mel_filename, linear_filename,
          embed_filename, time_steps, mel_frames, text). The embed file is
          only named here; it is not written by this function.

    Raises:
        - RuntimeError: if the processed wav has samples outside [-1, 1]
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path, sr=hparams.sample_rate)
    except FileNotFoundError:  #catch missing wav exception
        print(
            'file {} present in csv metadata is not present in wav folder. skipping!'
            .format(wav_path))
        return None

    #Pre-emphasize
    wav = audio.preemphasis(wav, hparams.preemphasis, hparams.preemphasize)

    #rescale wav
    if hparams.rescale:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    #Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        raise RuntimeError('wav has invalid value: {}'.format(wav))

    #M-AILABS extra silence specific
    if hparams.trim_silence:
        wav = audio.trim_silence(wav, hparams)

    # The saved audio is the (possibly trimmed) float waveform. These must be
    # bound whether or not trimming ran: binding them only inside the
    # trim_silence branch raised NameError at the padding step below whenever
    # trim_silence was disabled.
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    #Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.linearspectrogram(wav,
                                                 hparams).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    #sanity check
    assert linear_frames == mel_frames

    #Ensure time resolution adjustement between audio and mel-spectrogram:
    #both branches only differ in how the left/right pad amounts are obtained.
    hop_size = audio.get_hop_size(hparams)
    if hparams.use_lws:
        fft_size = hparams.n_fft if hparams.win_size is None else hparams.win_size
        l_pad, r_pad = audio.pad_lr(wav, fft_size, hop_size)
    else:
        l_pad, r_pad = audio.librosa_pad_lr(wav, hparams.n_fft, hop_size)

    #Pad the audio signal on both sides with the constant value (zeros)
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * hop_size

    #time resolution adjustement
    #ensure length of raw audio is multiple of hop size so that we can use
    #transposed convolution to upsample
    out = out[:mel_frames * hop_size]
    assert len(out) % hop_size == 0
    time_steps = len(out)

    # Write the spectrograms (transposed) and the audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    embed_filename = 'embed-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, embed_filename,
            time_steps, mel_frames, text)
Example #13
0
            e1 = encoder.embed_utterance(source_wav)
            e1 = e1[np.newaxis, :, np.newaxis]
            e1=torch.tensor(e1)
            for t in target:
                target_wav_name = os.listdir(en_path + t + "/wavs")
                embedding_tr = 0
                for i in range(10):
                    target_name = target_wav_name[i]
                    target_wav_fpath = en_path +t+"/wavs"+"/"+ target_name
                    target_wav = encoder.preprocess_wav(target_wav_fpath)
                    e2 = encoder.embed_utterance(target_wav)
                    embedding_tr = embedding_tr+ e2
                embedding_tr /=10

                print(embedding_tr.shape) 
                mel = audio.melspectrogram(source_wav, hparams)
                mel = pad_seq(mel.T).T
                mel = torch.from_numpy(mel[None, ...])

                embedding_tr =  embedding_tr[np.newaxis, :, np.newaxis]
                embedding_tr =torch.tensor(embedding_tr)
                mel,e1,embedding_tr = mel.cuda(),e1.cuda(),embedding_tr.cuda()
                #print("mel shape:",mel.shape)
                #print("e1 shape:",e1.shape)
                #print("e2 shape:",e2.shape)

                C,X_C,X_before,X_after,_ = model(mel, e1, embedding_tr)
                mel_out = torch.tensor(X_after).clone().detach().cpu().numpy()
                #print("mel_out shape:",mel_out.shape)
                if use_wavrnn:
                    wav = vocoder_wavrnn.infer_waveform(mel_out[0,0,:,:].T)