Example 1
def _process_utterance(out_dir, index, tar_cd_path, in_jd_path, in_cg_path):
    '''Preprocesses a single utterance for voice conversion training.

  This writes the input mel spectrograms and the target mel and linear-scale
  spectrograms to disk and returns a tuple to write to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    tar_cd_path: Path to the target audio file
    in_jd_path: Path to the first input audio file
    in_cg_path: Path to the second input audio file

  Returns:
    A (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
    in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    tar_cd_wav = audio.load_wav(tar_cd_path)

    # Compute the linear-scale spectrogram from the wav:
    tar_cd_spectrogram = audio.spectrogram(tar_cd_wav).astype(np.float32)
    n_frames = tar_cd_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    tar_cd_mel_spectrogram = audio.melspectrogram(tar_cd_wav).astype(
        np.float32)

    in_jd_wav = audio.load_wav(in_jd_path)
    in_cg_wav = audio.load_wav(in_cg_path)

    # The linear-scale spectrograms of the input wavs are not needed for
    # voice conversion training, so this step is skipped:
    # in_spectrogram = audio.spectrogram(in_cg_wav).astype(np.float32)

    # Compute the mel-scale spectrogram from the wav:
    in_jd_mel_spectrogram = audio.melspectrogram(in_jd_wav).astype(np.float32)
    in_cg_mel_spectrogram = audio.melspectrogram(in_cg_wav).astype(np.float32)

    # Write the spectrograms to disk:
    in_jd_mel_spectrogram_filename = 'Imuspeech-in_jd_mel_spec-%05d.npy' % index
    in_cg_mel_spectrogram_filename = 'Imuspeech-in_cg_mel_spec-%05d.npy' % index
    tar_cd_spectrogram_filename = 'Imuspeech-tar_cd_spec-%05d.npy' % index
    tar_cd_mel_filename = 'Imuspeech-tar_cd_mel-%05d.npy' % index

    np.save(os.path.join(out_dir, in_jd_mel_spectrogram_filename),
            in_jd_mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, in_cg_mel_spectrogram_filename),
            in_cg_mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tar_cd_spectrogram_filename),
            tar_cd_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tar_cd_mel_filename),
            tar_cd_mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
            in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename)
Example 2
def _process_utterance(out_dir, index, src_path, tgt_path):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    src_path: Path to the source audio file
    tgt_path: Path to the target audio file

  Returns:
    A (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames, src_spectrogram_filename) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    src_wav = audio.load_wav(src_path)
    tgt_wav = audio.load_wav(tgt_path)

    # Compute the linear-scale spectrogram from the wav:
    src_spectrogram = audio.spectrogram(
        src_wav,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    src_n_frames = src_spectrogram.shape[1]
    tgt_spectrogram = audio.spectrogram(tgt_wav).astype(np.float32)
    tgt_n_frames = tgt_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    src_mel_spectrogram = audio.melspectrogram(src_wav).astype(np.float32)
    tgt_mel_spectrogram = audio.melspectrogram(tgt_wav).astype(np.float32)

    # Write the spectrograms to disk:
    src_spectrogram_filename = 'wav2wav_src-spec-%05d.npy' % index
    src_mel_filename = 'wav2wav_src-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, src_spectrogram_filename),
            src_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, src_mel_filename),
            src_mel_spectrogram.T,
            allow_pickle=False)

    tgt_spectrogram_filename = 'wav2wav_tgt-spec-%05d.npy' % index
    tgt_mel_filename = 'wav2wav_tgt-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, tgt_spectrogram_filename),
            tgt_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, tgt_mel_filename),
            tgt_mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames,
            src_spectrogram_filename)
Example 3
def _process_utterance(out_dir, index, wav_path_neutral, wav_path_happy):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path_neutral: Path to the neutral-emotion audio file
    wav_path_happy: Path to the happy-emotion audio file

    Returns:
    A (spectrogram_neutral_filename, mel_neutral_filename, spectrogram_happy_filename,
    mel_happy_filename, n_frames) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav1 = audio.load_wav(wav_path_neutral)
    wav2 = audio.load_wav(wav_path_happy)

    # Compute the neutral linear-scale spectrogram from the wav:
    spectrogram_neutral = audio.spectrogram(wav1).astype(np.float32)
    n_frames = spectrogram_neutral.shape[1]
    # Compute a neutral mel-scale spectrogram from the wav:
    mel_spectrogram_neutral = audio.melspectrogram(wav1).astype(np.float32)

    # Compute the happy linear- and mel-scale spectrograms. Note that n_frames
    # is overwritten here, so the returned count refers to the happy spectrogram:
    spectrogram_happy = audio.spectrogram(wav2).astype(np.float32)
    n_frames = spectrogram_happy.shape[1]
    mel_spectrogram_happy = audio.melspectrogram(wav2).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_neutral_filename = 'neutral-spec-%05d.npy' % index
    mel_neutral_filename = 'neutral-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_neutral_filename),
            spectrogram_neutral.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_neutral_filename),
            mel_spectrogram_neutral.T,
            allow_pickle=False)

    spectrogram_happy_filename = 'happy-spec-%05d.npy' % index
    mel_happy_filename = 'happy-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_happy_filename),
            spectrogram_happy.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_happy_filename),
            mel_spectrogram_happy.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_neutral_filename, mel_neutral_filename,
            spectrogram_happy_filename, mel_happy_filename, n_frames)
Example 4
def _process_utterance(out_dir, index, source_wav_path, target_wav_path):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    source_wav_path: Path to the source audio file
    target_wav_path: Path to the target audio file

  Returns:
    A (source_mel_filename, n_frames, target_spectrogram_filename, target_mel_filename) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    source_wav = audio.load_wav(source_wav_path)
    target_wav = audio.load_wav(target_wav_path)

    # Compute the linear-scale spectrogram from the wav:
    target_spectrogram = audio.spectrogram(target_wav).astype(np.float32)
    n_frames = target_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    source_mel_spectrogram = audio.melspectrogram(source_wav).astype(
        np.float32)
    target_mel_spectrogram = audio.melspectrogram(target_wav).astype(
        np.float32)

    # Write the spectrograms to disk:
    #source_spectrogram_filename = 'source-spec-%05d.npy' % index
    source_mel_filename = 'source-mel-%05d.npy' % index
    target_spectrogram_filename = 'target-spec-%05d.npy' % index
    target_mel_filename = 'target-mel-%05d.npy' % index
    #np.save(os.path.join(out_dir, source_spectrogram_filename), source_spectrogram.T, allow_pickle=False)

    np.save(os.path.join(out_dir, source_mel_filename),
            source_mel_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, target_spectrogram_filename),
            target_spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, target_mel_filename),
            target_mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (source_mel_filename, n_frames, target_spectrogram_filename,
            target_mel_filename)
Example 5
def run_eval(args):
  #print(hparams_debug_string())
  is_teacher_force = False
  reference_mel = None

  synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
  synth.load(args.model, args.reference)
  base_path = get_output_base_path(args.model)

  if args.reference is not None:
    ref_wav = audio.load_wav(args.reference)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    #path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference))[0])
    path = 'ref-%s.wav' % (os.path.splitext(os.path.basename(args.reference))[0])
  else:
    raise ValueError("You must set the reference audio.")

  with open('examples_test.txt', 'r') as fs:
    lines = fs.readlines()
    for i, line in enumerate(lines):
      # Each line is assumed to hold pipe-delimited metadata with the text
      # to synthesize in the last field:
      args.text = line.strip().split('|')[-1]

      path_id = '%d_' % (i + 6)
      new_path = path_id + path
      print('Synthesizing: %s' % args.text)
      print('Output wav file: %s' % new_path)

      with open(new_path, 'wb') as f:
        f.write(synth.synthesize(args.text, reference_mel=reference_mel))
Example 6
def _process_utterance(out_dir,
                       index,
                       wav_path,
                       labels_path,
                       text,
                       person_id=1):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    end = int(end_offset *
              hparams.sample_rate) if end_offset is not None else -1
    wav = wav[start:end]
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)
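
Both this example and Example 25 call a `_parse_labels` helper that is not shown. A minimal sketch of what it could look like, assuming label files whose lines hold `time  phone-id  label` with silence segments marked `sil` (the field layout and the `sil` marker are assumptions for illustration, not confirmed by the source):

def _parse_labels(labels_path):
    '''Returns (start_offset, end_offset) in seconds, skipping leading and
    trailing silence. end_offset is None when there is nothing to trim.'''
    labels = []
    with open(labels_path) as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) >= 3:
                labels.append((float(parts[0]), parts[2]))  # (time_sec, label)
    start_offset, end_offset = 0.0, None
    if labels and labels[0][1] == 'sil':
        start_offset = labels[0][0]
    if len(labels) > 1 and labels[-1][1] == 'sil':
        end_offset = labels[-1][0]
    return start_offset, end_offset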
Example 7
def _process_utterance(out_dir, index, wav_path, text):
  '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
  mel_filename = 'ljspeech-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, text)
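
For context, a `_process_utterance` like the one above is usually driven by a loop that reads a metadata file and collects the returned tuples into train.txt. A minimal single-process sketch, assuming an LJSpeech-style metadata.csv with id|transcription|normalized_text fields (the file layout and the `preprocess` name are illustrative assumptions):

import os

def preprocess(in_dir, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    metadata = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, 1):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            metadata.append(_process_utterance(out_dir, index, wav_path, parts[2]))
    # One pipe-delimited line per utterance:
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join(str(x) for x in m) + '\n')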
Example 8
def process_utterance(out_path, index, wav_path, text):
    '''
    Generates linear- and mel-scale spectrograms for a (text, wav) pair
    and saves the numpy arrays to disk.

    Returns the filenames of the saved .npy files, the frame count, and the text.
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index

    # .T: save the transpose of the array (time-major: frames first)
    # allow_pickle=False: disallow pickled objects for security and portability
    np.save(os.path.join(out_path, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_path, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
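
Because the arrays are saved transposed, loading them back yields time-major arrays. A quick sanity check, reusing the names from the function above (the exact bin counts depend on the hparams):

spec = np.load(os.path.join(out_path, spectrogram_filename))
mel = np.load(os.path.join(out_path, mel_filename))
print(spec.shape)  # (n_frames, num_freq), e.g. (n_frames, 1025)
print(mel.shape)   # (n_frames, num_mels), e.g. (n_frames, 80)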
Example 9
def _process_utterance(out_dir, prompt_id, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim leading and trailing silence:
    margin = int(hparams.sample_rate * 0.1)
    wav = wav[margin:-margin]
    wav, _ = librosa.effects.trim(wav,
                                  top_db=40,
                                  frame_length=1024,
                                  hop_length=256)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'amy-spec-%s.npy' % prompt_id
    mel_filename = 'amy-mel-%s.npy' % prompt_id
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example 10
def _process_utterance(out_dir, index, wav_path, pinyin):
  '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    pinyin: The pinyin of Chinese spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
  '''

  # Load the audio to a numpy array:
  wav = audio.load_wav(wav_path)

  # Compute the linear-scale spectrogram from the wav:
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]

  # Compute a mel-scale spectrogram from the wav:
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

  # Write the spectrograms to disk:
  spectrogram_filename = 'femalemandarin-spec-%05d.npy' % index
  mel_filename = 'femalemandarin-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

  # Return a tuple describing this training example:
  return (spectrogram_filename, mel_filename, n_frames, pinyin)
Example 11
def run_eval(args):
  print(hparams_debug_string())
  is_teacher_force = False
  mel_targets = args.mel_targets
  reference_mel = None
  if args.mel_targets is not None:
    is_teacher_force = True
    mel_targets = np.load(args.mel_targets)
  synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
  synth.load(args.checkpoint, args.reference_audio)
  base_path = get_output_base_path(args.checkpoint)

  if args.reference_audio is not None:
    ref_wav = audio.load_wav(args.reference_audio)
    reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference_audio))[0])
  else:
    if hparams.use_gst:
      print("*******************************")
      print("TODO: add style weights when there is no reference audio. Now we use random weights, " + 
             "which may generate unintelligible audio sometimes.")
      print("*******************************")
      path = '%s_ref-randomWeight.wav' % (base_path)
    else:
      raise ValueError("You must set the reference audio if you don't want to use GSTs.")

  with open(path, 'wb') as f:
    print('Synthesizing: %s' % args.text)
    print('Output wav file: %s' % path)
    f.write(synth.synthesize(args.text, reference_mel=reference_mel))
Example 12
    def synthesize(self,
                   path_in,
                   path_re,
                   mel_targets=None,
                   reference_mel=None,
                   alignment_path=None):
        wav_in = audio.load_wav(path_in)
        wav_re = audio.load_wav(path_re)
        mel_in = audio.melspectrogram(wav_in).astype(np.float32)
        mel_re = audio.melspectrogram(wav_re).astype(np.float32)
        # print(mel_jp)
        feed_dict = {
            self.model.inputs: [mel_in.T],
            # Input length is the frame count (mel_in has shape [n_mels, n_frames]):
            self.model.input_lengths: np.asarray([mel_in.shape[1]],
                                                 dtype=np.int32),
            self.model.inputs_jp: [mel_re.T],
        }
        # if mel_targets is not None:
        #   mel_targets = np.expand_dims(mel_targets, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
        # if reference_mel is not None:
        #   reference_mel = np.expand_dims(reference_mel, 0)
        #   print(reference_mel.shapex)
        #   feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})

        wav_out, alignments = self.session.run(
            [self.wav_output, self.alignments], feed_dict=feed_dict)
        wav = audio.inv_preemphasis(wav_out)
        end_point = audio.find_endpoint(wav)
        wav = wav[:end_point]
        # Build a unique output name from the current timestamp plus a
        # zero-padded two-digit random number:
        nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
        randomNum = '%02d' % random.randint(0, 99)
        uniqueNum = nowTime + randomNum
        out_dir = os.path.join("static", "out", uniqueNum + ".wav")
        out_name = uniqueNum + ".wav"

        audio.save_wav(wav, out_dir)
        out = io.BytesIO()
        audio.save_wav(wav, out)
        # n_frame = int(end_point / (hparams.frame_shift_ms / 1000* hparams.sample_rate)) + 1
        # plot.plot_alignment(alignments[:,:n_frame], alignment_path, info='%s' % (path))
        return out_dir, out_name
Example 13
def convert_file(audio_path):
    y = audio.load_wav(audio_path)
    peak = np.abs(y).max()
    if hp.peak_norm or peak > 1.0:
        y *= (0.9 / peak)

    linear = audio.spectrogram(y)
    mel = audio.melspectrogram(y)
    return mel.astype(np.float32), linear.astype(np.float32)
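
A possible way to use `convert_file` in a preprocessing loop, mirroring the transposed np.save convention of the other examples (the paths and filenames here are illustrative assumptions):

mel, linear = convert_file('wavs/utt-00001.wav')
np.save(os.path.join(out_dir, 'utt-00001-mel.npy'), mel.T, allow_pickle=False)
np.save(os.path.join(out_dir, 'utt-00001-linear.npy'), linear.T, allow_pickle=False)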
Example 14
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Strip prosody markers (#1-#4) from the text, then convert it to pinyin:
    text = text.replace("#1", "").replace("#2", "").replace("#3", "").replace("#4", "")
    pinyin = " ".join(get_pinyin(text))

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    wav = wav / np.max(np.abs(wav)) * 0.9  # peak-normalize to 0.9

    # denoise
    if hparams.mmse_denoise_by_bothEndOfAudio and len(
            wav) > hparams.sample_rate * (hparams.length_as_noise * 2 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * hparams.length_as_noise)],
            wav[-int(hparams.sample_rate * hparams.length_as_noise):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # trim silence
    wav = audio.trim_silence(
        wav, hparams.trim_top_db)  # top_db=30 for aishell, 60 for BZNSYP
    # audio.save_wav(wav, wav_path.replace(".wav", "_trimed.wav"))

    # convert wav to 16bit int
    wav *= 32768
    wav = wav.astype(np.int16)

    # extract LPC feature
    extractor = lpcnet.FeatureExtractor()
    feat = extractor.compute_feature(wav)
    n_frames = feat.shape[0]

    # write the lpc feature to disk
    feature_filename = 'biaobei-lpc-feat-%05d.npy' % index
    np.save(os.path.join(out_dir, feature_filename), feat, allow_pickle=False)

    # Return a tuple describing this training example:
    return (feature_filename, n_frames, pinyin)
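
One caveat on the 16-bit conversion above: `wav *= 32768` is safe here only because the waveform was normalized to a 0.9 peak earlier; for signals that may reach +/-1.0, a clipped conversion avoids int16 wraparound. A defensive variant (an alternative sketch, not what the source does):

wav_int16 = np.clip(wav * 32767.0, -32768, 32767).astype(np.int16)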
Example 15
def preprocess_utterance(wav_file, input_path, output_path):
    wav = audio.load_wav(wav_file)
    wav_path, name = os.path.split(wav_file)
    out_dir = wav_path.replace(input_path, output_path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_filename = name.replace('.wav', '.npy')
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    print(mel_filename, mel_spectrogram.shape[1])
Example 16
def _process_utterance(out_dir, index, wav_path, text):
  wav = audio.load_wav(wav_path)
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'selvas-spec-%04d.npy' % int(index)
  mel_filename = 'selvas-mel-%04d.npy' % int(index)
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)
Example 17
def run_eval(args):
  print(hparams_debug_string())
  synth = Synthesizer()
  synth.load(args.checkpoint)
  base_path = get_output_base_path(args.checkpoint)
  wav = load_wav(args.reference_audio)
  mel = melspectrogram(wav).transpose()
  for i, text in enumerate(sentences):
    path = '%s-%d.wav' % (base_path, i)
    print('Synthesizing: %s' % path)
    with open(path, 'wb') as f:
      f.write(synth.synthesize(text, mel))
Example 18
def _process_utterance(out_dir, index, wav_path):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index of the utterance (unused; filenames are
            derived from the wav file name).
        wav_path: Path to the audio file containing the speech input

    Returns:
        A (spectrogram_filename, mel_filename, n_frames) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Cut or pad the wav to a fixed length of hparams.duration seconds:
    length = hparams.sample_rate * hparams.duration
    wav = librosa.util.fix_length(wav, length)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Compute MFCCs (optional, currently disabled):
    # mfcc = audio.mfcc(wav).astype(np.float32)

    # Write the spectrograms to disk:
    wav_name = os.path.basename(wav_path)
    wav_name = wav_name.split('.')[0]
    spectrogram_filename = 'spec-%s.npy' % wav_name
    mel_filename = 'mel-%s.npy' % wav_name
    mfcc_filename = 'mfcc-%s.npy' % wav_name
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    # np.save(
    #     os.path.join(out_dir, mfcc_filename),
    #     mfcc.T,
    #     allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames)
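
For reference, `librosa.util.fix_length` truncates the signal when it is longer than the requested size and zero-pads it at the end when shorter; a plain numpy equivalent of the call above:

if len(wav) > length:
    wav = wav[:length]                         # truncate
else:
    wav = np.pad(wav, (0, length - len(wav)))  # zero-pad at the end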
Example 19
def _process_utterance(out_dir, index, wav_path, text, person_id):
    # Load the wav file (the max-length filter below is currently disabled):
    wav = audio.load_wav(wav_path)
    #max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    #if len(wav) > max_samples:
    #    return None
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'arctic-spec-%05d.npy' % index
    mel_filename = 'arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)
Example 20
def _process_utterance(out_dir, index, wav_path, text):
  wav, _ = audio.load_wav(wav_path)

  spectrogram = audio.spectrogram(wav).astype(np.float32)  # (1025, frame)
  n_frames = spectrogram.shape[1]

  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)  # (80, frame)

  spectrogram_filename = 'kss-spec-%05d.npy' % index
  mel_filename = 'kss-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)  # (frame, 1025)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)  # (frame, 80)

  return (spectrogram_filename, mel_filename, n_frames, text)
Example 21
def _process_utterance(out_dir, name, wav_path, text):
    wav = audio.load_wav(wav_path)
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    spectrogram_filename = 'bznsyp-spec-%s.npy' % name
    mel_filename = 'bznsyp-mel-%s.npy' % name
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
    #text = sentence_to_pinyin(text)
    return (spectrogram_filename, mel_filename, n_frames, text)
Example 22
def __generate_spectrograms(file_path, category, index, out_dir):
    wav = audio.load_wav(file_path)
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # Write the spectrograms to disk:
    spectrogram_filename = '{}spec{}.npy'.format(category, index)
    mel_filename = '{}mel{}.npy'.format(category, index)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)
Example 23
def get_wav_linear_and_mel_target(wav_path, set_spec_length=None):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # Return a tuple describing this training example:
    if set_spec_length is not None:
        return (spectrogram.T[:set_spec_length],
                mel_spectrogram.T[:set_spec_length], n_frames)
    #wav = wav.reshape(-1, 1)
    #wav = np.pad(wav, [[2048, 0], [0, 0]], 'constant')
    #wav = np.pad(wav, [[2048, 0]], 'constant')
    return (wav, spectrogram.T, mel_spectrogram.T, n_frames)
Example 24
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    print('wav_path:', wav_path)
    wav = audio.load_wav(wav_path)
    print('wav:', wav.shape)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    #print('spectrogram: ', spectrogram, '\nspectrogram,shape: ', spectrogram.shape)
    n_frames = spectrogram.shape[1]
    print('n_frames : ', n_frames)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    #print('melspectrogram: ', mel_spectrogram, '\nspectrogram,shape: ', mel_spectrogram.shape)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    print('spectrogram_filename:', spectrogram_filename)
    print('mel_filename:', mel_filename)
    print('out_dir: ', out_dir)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example 25
def _process_utterance(out_dir, index, wav_path, labels_path, text):
  # Load the wav file and trim silence from the ends:
  wav = audio.load_wav(wav_path)
  start_offset, end_offset = _parse_labels(labels_path)
  start = int(start_offset * hparams.sample_rate)
  end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
  wav = wav[start:end]
  max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
  if len(wav) > max_samples:
    return None
  spectrogram = audio.spectrogram(wav).astype(np.float32)
  n_frames = spectrogram.shape[1]
  mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
  spectrogram_filename = 'blizzard-spec-%05d.npy' % index
  mel_filename = 'blizzard-mel-%05d.npy' % index
  np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
  np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
  return (spectrogram_filename, mel_filename, n_frames, text)
Example 26
def _process_utterance(wav_path, text, id):
    '''Preprocesses a single utterance audio/text pair.
    Unlike the other variants, this computes the mel and linear scale
    spectrograms in memory and returns them directly instead of writing
    them to disk.
    Args:
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      id: The identity of the example
    Returns:
      A (wav, spectrogram, mel_spectrogram, text, id) tuple
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32).T
    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
    return wav, spectrogram, mel_spectrogram, text, id
Example 27
def run_eval(args):
    print(hparams_debug_string())
    reference_mel = None
    synth = Synthesizer()
    synth.load(args.checkpoint, args.reference_audio)

    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T

    base_path = get_output_base_path(args.checkpoint)

    for i, text in enumerate(sentences):
        path = '%s_%d_%.1f_%d.wav' % (base_path + '_gst', hparams.gst_index,
                                      hparams.gst_scale, i)
        print('Synthesizing: %s' % path)
        with open(path, 'wb') as f:
            f.write(synth.synthesize(text, reference_mel=reference_mel))
Example 28
def _process_utterance(out_dir, index, wav_path, pinyin):
    wav = audio.load_wav(wav_path)

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frame = spectrogram.shape[1]
    if n_frame > hp.max_frame_num:
        return None

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    mel_filename = 'thchs30-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    return (spectrogram_filename, mel_filename, n_frame, pinyin)
Example 29
def _process_utterance(out_dir, name, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    name: The name to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file
    hparams: Hyperparameters for the audio processing routines

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, hparams)

    # trim silences here
    wav = audio.trim_silence(wav, hparams)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
    mel_filename = 'mailabs-mel-{}.npy'.format(name)
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example 30
def main():
	accepted_modes = ['eval', 'synthesis', 'live']
	parser = argparse.ArgumentParser()
	parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint')
	parser.add_argument('--hparams', default='',
		help='Hyperparameter overrides as a comma-separated list of name=value pairs')
	parser.add_argument('--name', required=True, help='Name of logging directory.')
	# --model is referenced by the live-mode check below; without it the
	# script would crash with an AttributeError:
	parser.add_argument('--model', default='Tacotron-2', help='Model to test: Tacotron-2 or Wavenet')
	parser.add_argument('--mels_dir', default='gst_output/eval/', help='folder to contain mels to synthesize audio from using the Wavenet')
	parser.add_argument('--mode', default='eval', help='mode of run: can be one of {}'.format(accepted_modes))
	parser.add_argument('--GTA', default='True', help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
	parser.add_argument('--text', required=True, help='Single test text sentence')
	parser.add_argument('--reference_audio', default=None, help='Reference audio path')
	parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')

	args = parser.parse_args()

	if args.mode not in accepted_modes:
		raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode))

	if args.mode=='live' and args.model=='Wavenet':
		raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!')

	if args.GTA not in ('True', 'False'):
		raise ValueError('GTA option must be either True or False')

	if args.mode == 'live':
		warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!')
	if args.mode == 'synthesis':
		raise ValueError('I don\'t recommend running WaveNet on entire dataset.. The world might end before the synthesis :) (only eval allowed)')

	gst_checkpoint, wave_checkpoint, hparams = prepare_run(args)
	sentences = get_sentences(args)
	if args.reference_audio is not None:
		ref_wav = audio.load_wav(args.reference_audio)
		reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
	else:
		reference_mel = None
	
	synthesize(args, hparams, gst_checkpoint, wave_checkpoint, sentences, reference_mel)
Example 31
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a tuple to write
    to the train.txt file.

    Args:
      out_dir: The directory to write the spectrograms into
      index: The numeric index to use in the spectrogram filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file

    Returns:
      A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)
    if _max_out_length is not None:
        max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
        if len(wav) > max_samples:
            return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'blizzard2013-spec-%05d.npy' % index
    mel_filename = 'blizzard2013-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
Example 32
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

  This writes the mel and linear scale spectrograms to disk and returns a tuple to write
  to the train.txt file.

  Args:
    out_dir: The directory to write the spectrograms into
    index: The numeric index to use in the spectrogram filenames.
    wav_path: Path to the audio file containing the speech input
    text: The text spoken in the input audio file

  Returns:
    A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
  '''
    # Convert Unicode to CISAMPA:
    url = 'http://127.0.0.1:8080/get_sentence/' + text
    text = requests.get(url).text

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'urdu-spec-%05d.npy' % index
    mel_filename = 'urdu-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename),
            spectrogram.T,
            allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename),
            mel_spectrogram.T,
            allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
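
One caveat about the CISAMPA conversion in the last example: requests.get is called without a timeout and without URL-encoding the sentence, so a stalled service hangs preprocessing and special characters can break the URL. A hardened sketch (the endpoint comes from the example; the helper name and parameters are illustrative assumptions):

import urllib.parse

import requests

def unicode_to_cisampa(text, base_url='http://127.0.0.1:8080/get_sentence/', timeout=10):
    # URL-encode so spaces and non-ASCII characters survive the request:
    url = base_url + urllib.parse.quote(text)
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()  # fail loudly rather than storing an error page as text
    return resp.text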