Example #1
def _process_utterance(out_dir, index, wav_path, text):
  '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
  mel_filename = 'ljspeech-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, text)
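
For context, tuples like the one returned above are typically collected by a driver that
walks the dataset metadata and writes train.txt. A minimal sketch of such a driver,
assuming an LJ Speech-style metadata.csv; the build_from_path name and file layout here
are illustrative, not taken from the source:

import os

def build_from_path(in_dir, out_dir):
    # Each metadata.csv line is assumed to look like: id|raw text|normalized text
    metadata = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            metadata.append(_process_utterance(out_dir, index, wav_path, parts[2]))

    # Write one train.txt line per utterance, fields separated by '|':
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')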
Example #2
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  # Use None rather than -1 so the slice keeps the final sample when no end label exists:
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else None
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)
Example #3
def _process_utterance(out_dir, index, fname, wav_path, text, label):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      fname: The base filename of the utterance.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      label: The label associated with the utterance.

    Returns:
      A (fname, spectrogram_filename, mel_filename, n_frames, text, label) tuple to write
      to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'emotion-spec-%05d.npy' % index
    mel_filename = 'emotion-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (fname, spectrogram_filename, mel_filename, n_frames, text, label)
Example #4
def preprocess(data_dir, normalized_dir):
    # Ingest the text file:
    #normalize(data_dir,  normalized_dir)
    wav_files = sorted(Path(data_dir).glob('**/*.wav'))
    train_tuples = []
    for wav_file in wav_files:
        # Get the transcript
        text_filepath = (wav_file.parent
                     / ('-'.join(wav_file.stem.split('-')[:-1])
                       + '.trans.txt'))
        text = ''
        with text_filepath.open() as text_file:
            for line in text_file:
                split_line = line.split()
                # Transcript lines start with the utterance id, which has no extension:
                if split_line[0] == wav_file.stem:
                    text = ' '.join(split_line[1:])
                    break
        
        wav = audio.load_wav(str(wav_file))

        spectrogram = audio.spectrogram(wav).astype(np.float32)
        spectrogram_filename = (wav_file.stem + '_spectrogram')
        mel_spectrogram_filename = (wav_file.stem + '_mel_spectrogram')

        mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
        np.save(os.path.join(normalized_dir, spectrogram_filename),
                spectrogram.T,
                allow_pickle=False)
        np.save(os.path.join(normalized_dir, mel_spectrogram_filename),
                mel_spectrogram.T,
                allow_pickle=False)
        train_tuples.append((spectrogram_filename, mel_spectrogram_filename,
                spectrogram.shape[1],
                text))
        print('Processed %s' % wav_file.name)

    # Write the metadata for all utterances once, after the loop:
    with open(os.path.join(normalized_dir, 'train.txt'), 'w') as train_file:
        for train_tuple in train_tuples:
            train_file.write('|'.join([str(x) for x in train_tuple]) + '\n')
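
A hypothetical invocation, assuming a LibriSpeech-style directory layout (both paths are
illustrative):

preprocess('LibriSpeech/dev-clean', 'training_data')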
Example #5
def _process_utterance(out_dir, index, wav_path, labels_path, text):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    # Use None rather than -1 so the slice keeps the final sample when no end label exists:
    end = int(end_offset *
              hparams.sample_rate) if end_offset is not None else None
    wav = wav[start:end]
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #6
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear spectrograms to disk and returns a tuple to write to
    the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ptbr-spec-%05d.npy' % index
    mel_filename = 'ptbr-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #7
def _process_utterance(out_dir, index, wav_path, pinyin):
  '''Preprocesses a single utterance audio/text pair.
  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.
  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of Chinese spoken in the input audio file
  Returns:
    A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)

  # Rescale the wav so all clips share a unified peak amplitude:
  wav = wav / np.abs(wav).max() * 0.999

  # trim silence
  wav = audio.trim_silence(wav)

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'biaobei-spec-%05d.npy' % index
  mel_filename = 'biaobei-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example #8
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair, writing the linear and mel
    scale spectrograms to disk and returning a tuple to write to train.txt.'''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'nawar-spec-%05d.npy' % index
    mel_filename = 'nawar-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example #9
def _process_utterance(out_dir, index, wav_path, pinyin):
    # Load the audio
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * 0.999

    # Trim silence
    wav = audio.trim_silence(wav)

    # Compute the linear-scale and mel-scale spectrograms
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Save both spectrograms
    spectrogram_filename = 'biaobei-spec-%05d.npy' % index
    mel_filename = 'biaobei-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example #10
def _process_utterance(out_dir, index, wav_path, text, pml_cmp, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to
    write to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      pml_cmp: One-dimensional array containing vocoder features read from a .cmp file
      hparams: Hyperparameters, including the PML feature dimension

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, pml_filename, pml_frames, text)
      tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Write the PML features to disk
    pml_filename = 'nick-pml-%05d.npy' % index
    pml_dimension = hparams.pml_dimension
    pml_features = pml_cmp.reshape((-1, pml_dimension))
    pml_frames = pml_features.shape[0]
    np.save(os.path.join(out_dir, pml_filename),
            pml_features,
            allow_pickle=False)

    # Remove silence from the wav
    # silence_removed = audio.remove_silence(wav)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    # Ensure lengths of spectrograms and PML features are the same
    if n_frames > pml_frames:
        spectrogram = spectrogram[:, :pml_frames]
        n_frames = pml_frames  # keep the returned frame count consistent with the trim

    # Check the shape of the mel target
    if mel_frames > pml_frames:
        mel_spectrogram = mel_spectrogram[:, :pml_frames]

    # Write the spectrograms to disk:
    spectrogram_filename = 'nick-spec-%05d.npy' % index
    mel_filename = 'nick-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pml_filename,
            pml_frames, text)
Example #11
import numpy as np

from util import audio

point_of_interest = '001'

# read from Nick
CMP = np.fromfile(
    f'/home/josh/tacotron/Nick/pml/herald_{point_of_interest}_1.cmp',
    dtype=np.float32)
# pml_features = nick_parameters.reshape()
f = open(f'/home/josh/tacotron/Nick/txt/herald_{point_of_interest}_1.txt', 'r')

# Load the audio to a numpy array:
wav = audio.load_wav(
    f'/home/josh/tacotron/Nick/wav/herald_{point_of_interest}_1.wav')

# Compute the linear-scale spectrogram from the wav:
spectrogram = audio.spectrogram(wav).astype(np.float32)

# try to reshape the Nick PML features into an 86 x something matrix
# pml_dimension = 86
pml_dimension = 163
pml_features = CMP.reshape((-1, pml_dimension))

# Get the saved spectrogram
linear_target = np.load('/home/josh/tacotron/training/nick-spec-00605.npy')

print(f.read())
print('PML Feature Info', CMP.shape, CMP.size / pml_dimension,
      pml_features.shape)
print('Audio Info', spectrogram.shape, linear_target.shape, wav.shape)
Example #12
def _process_utterance(mel_dir, linear_dir, wav_dir, index, wav_path, text,
                       hparams):
    """
    Preprocesses a single utterance wav/text pair

    this writes the mel scale spectogram to disk and return a tuple to write
    to the train.txt file

    Args:
        - mel_dir: the directory to write the mel spectograms into
        - linear_dir: the directory to write the linear spectrograms into
        - wav_dir: the directory to write the preprocessed wav into
        - index: the numeric index to use in the spectogram filename
        - wav_path: path to the audio file containing the speech input
        - text: text spoken in the input audio file
        - hparams: hyper parameters

    Returns:
        - A tuple: (audio_filename, mel_filename, linear_filename, time_steps, mel_frames, linear_frames, text)
    """
    try:
        # Load the audio as numpy array
        wav = audio.load_wav(wav_path)
    except FileNotFoundError:  # catch missing wav files
        print('file {} listed in the csv metadata is missing from the wav folder; skipping!'
              .format(wav_path))
        return None

    # M-AILABS extra silence specific
    wav = audio.trim_silence(wav)

    # Pre-emphasize
    wav = audio.preemphasis(wav)

    # Rescale wav
    # wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # Assert all audio is in [-1, 1]
    if (wav > 1.).any() or (wav < -1.).any():
        # raise RuntimeError('wav has invalid value: {}'.format(wav))
        print('file {} has invalid value. skipping!'.format(wav_path))
        return None

    # [-1, 1]
    out = wav
    constant_values = 0.
    out_dtype = np.float32

    # Compute the mel scale spectrogram from the wav
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_frames = mel_spectrogram.shape[1]

    if mel_frames > hparams.max_frame_num:
        return None

    # Compute the linear scale spectrogram from the wav
    linear_spectrogram = audio.spectrogram(wav).astype(np.float32)
    linear_frames = linear_spectrogram.shape[1]

    # Sanity check
    assert linear_frames == mel_frames

    # Ensure time resolution adjustment between audio and mel-spectrogram

    l_pad, r_pad, hop_size = audio.librosa_pad_lr(wav)

    # Pad the audio signal on the right (librosa-style framing, here with constant
    # zeros) to avoid frame inconsistency:
    out = np.pad(out, (l_pad, r_pad),
                 mode='constant',
                 constant_values=constant_values)

    assert len(out) >= mel_frames * hop_size

    # Time resolution adjustment:
    # ensure the length of the raw audio is a multiple of hop size so that we can use
    # transposed convolution to upsample
    out = out[:mel_frames * hop_size]
    assert len(out) % hop_size == 0
    time_steps = len(out)

    # Write the spectrogram and audio to disk
    audio_filename = 'audio-{}.npy'.format(index)
    mel_filename = 'mel-{}.npy'.format(index)
    linear_filename = 'linear-{}.npy'.format(index)
    np.save(os.path.join(wav_dir, audio_filename),
            out.astype(out_dtype),
            allow_pickle=False)
    np.save(os.path.join(mel_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(linear_dir, linear_filename),
            linear_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example
    return (audio_filename, mel_filename, linear_filename, time_steps,
            mel_frames, text)
Example #13
def _process_utterance(out_dir, fid, wav_path, label_path, phn_seq,
                       phn2idx_map, frame_phn_idx, trim_silence, audio_config,
                       f0_config):
    '''Preprocesses a single utterance audio/text pair.

    This writes the extracted features (mel spectrogram, linear spectrogram, phone
    durations, and F0) to disk and returns a tuple describing the utterance.

    Args:
        out_dir: The directory to write the features into
        fid: Utterance id.
        wav_path: Path to the audio file containing the speech input
        label_path: Path to the full label file.
        phn_seq: text-phn-seq from ppp file.
        phn2idx_map: map phn (str) to index (int).
        frame_phn_idx: frame-level text-phn positions.
        trim_silence: Whether to trim leading/trailing silence using the label file.
        audio_config: Audio feature extraction settings.
        f0_config: F0 extraction settings.
    Returns:
        tuple (expanded phn-seq length, fid)
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, audio_config.sample_rate)

    # Trim silence
    if trim_silence:
        start_offset, end_offset = _parse_labels(label_path)
        start_offset_frames = int(start_offset /
                                  (audio_config.frame_shift_ms / 1000.0))
        end_offset_frames = int(end_offset /
                                (audio_config.frame_shift_ms / 1000.0))
    else:
        start_offset_frames = 0
        end_offset_frames = 1000000

    # Compute a mel-scale spectrogram for wavenet, wavernn
    mel_spectrogram = melspectrogram(
        wave=wav,
        sample_rate=audio_config.sample_rate,
        num_freq=audio_config.num_freq,
        num_mels=audio_config.num_mels,
        frame_length_ms=audio_config.frame_length_ms,
        frame_shift_ms=audio_config.frame_shift_ms,
        pre_emphasis=audio_config.pre_emphasis,
        fmin=audio_config.fmin,
        min_level_db=audio_config.min_level_db,
        ref_level_db=audio_config.ref_level_db).astype(np.float32)
    mel_spectrogram = mel_spectrogram.T  # -> [T, num_mels]
    stft = spectrogram(wave=wav,
                       sample_rate=audio_config.sample_rate,
                       num_freq=audio_config.num_freq,
                       frame_length_ms=audio_config.frame_length_ms,
                       frame_shift_ms=audio_config.frame_shift_ms,
                       pre_emphasis=audio_config.pre_emphasis,
                       min_level_db=audio_config.min_level_db,
                       ref_level_db=audio_config.ref_level_db).astype(
                           np.float32)
    stft = stft.T  # -> [T, num_freq]
    cont_f0 = compute_f0_from_wav(speaker_name='song_guiniang',
                                  wave_path=wav_path,
                                  f0_config=f0_config)

    if len(cont_f0) > len(mel_spectrogram):
        cont_f0 = cont_f0[:len(mel_spectrogram)]
    elif len(cont_f0) < len(mel_spectrogram):
        pad_len = len(mel_spectrogram) - len(cont_f0)
        cont_f0 = np.pad(cont_f0, (0, pad_len),
                         mode="constant",
                         constant_values=cont_f0[-1])

    # Process phn sequence
    phn_seq_idx = np.asarray([phn2idx_map[i] for i in phn_seq])
    dur = np.asarray([
        len(list(group)) for (key, group) in itertools.groupby(frame_phn_idx)
    ])
    pos_list = [key for (key, group) in itertools.groupby(frame_phn_idx)]
    txt_phnseq = np.array(phn_seq)[pos_list]

    # Cut off trailing silence
    cut_len = mel_spectrogram.shape[0] - sum(dur)
    if cut_len > 0:
        mel_spectrogram = mel_spectrogram[:-cut_len]
        stft = stft[:-cut_len]
        cont_f0 = cont_f0[:-cut_len]
    else:
        assert dur[-1] > -cut_len, "Not enough trailing silence."
        dur[-1] = dur[-1] + cut_len

    # Save features
    with open(f"{out_dir}/{fid}.phntone.txt", 'w') as f:
        f.write(' '.join(txt_phnseq) + '\n')
    np.save(f'{out_dir}/{fid}.mel.npy', mel_spectrogram, allow_pickle=False)
    np.save(f'{out_dir}/{fid}.stft.npy', stft, allow_pickle=False)
    np.save(f'{out_dir}/{fid}.dur.npy', dur, allow_pickle=False)
    np.save(f'{out_dir}/{fid}.f0.npy', cont_f0, allow_pickle=False)
    return len(frame_phn_idx), fid
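
The saved .npy features can be loaded back with plain numpy for training. A minimal
sketch; the out_dir and fid values are illustrative:

import numpy as np

out_dir = 'training_data'  # illustrative path
fid = 'utt_0001'           # illustrative utterance id
mel = np.load(f'{out_dir}/{fid}.mel.npy')     # [T, num_mels]
stft = np.load(f'{out_dir}/{fid}.stft.npy')   # [T, num_freq]
dur = np.load(f'{out_dir}/{fid}.dur.npy')     # frames per phone
cont_f0 = np.load(f'{out_dir}/{fid}.f0.npy')  # [T]
# After the trailing-silence cut above, all frame-level features share one length:
assert dur.sum() == len(mel) == len(stft) == len(cont_f0)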