def _process_utterance(out_dir, index, tar_cd_path, in_jd_path, in_cg_path):
    '''Preprocesses a single utterance triple of audio files.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        tar_cd_path: Path to the target audio file
        in_jd_path: Path to the first input audio file
        in_cg_path: Path to the second input audio file

    Returns:
        A (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
        in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename)
        tuple to write to train.txt
    '''
    # Load the target audio to a numpy array:
    tar_cd_wav = audio.load_wav(tar_cd_path)

    # Compute the linear-scale spectrogram from the wav:
    tar_cd_spectrogram = audio.spectrogram(tar_cd_wav).astype(np.float32)
    n_frames = tar_cd_spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    tar_cd_mel_spectrogram = audio.melspectrogram(tar_cd_wav).astype(np.float32)

    in_jd_wav = audio.load_wav(in_jd_path)
    in_cg_wav = audio.load_wav(in_cg_path)

    # The inputs are used only for voice training, so their linear-scale
    # spectrograms are not needed:
    # in_spectrogram = audio.spectrogram(in_cg_wav).astype(np.float32)

    # Compute the mel-scale spectrograms from the wavs:
    in_jd_mel_spectrogram = audio.melspectrogram(in_jd_wav).astype(np.float32)
    in_cg_mel_spectrogram = audio.melspectrogram(in_cg_wav).astype(np.float32)

    # Write the spectrograms to disk:
    in_jd_mel_spectrogram_filename = 'Imuspeech-in_jd_mel_spec-%05d.npy' % index
    in_cg_mel_spectrogram_filename = 'Imuspeech-in_cg_mel_spec-%05d.npy' % index
    tar_cd_spectrogram_filename = 'Imuspeech-tar_cd_spec-%05d.npy' % index
    tar_cd_mel_filename = 'Imuspeech-tar_cd_mel-%05d.npy' % index
    np.save(os.path.join(out_dir, in_jd_mel_spectrogram_filename),
            in_jd_mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, in_cg_mel_spectrogram_filename),
            in_cg_mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, tar_cd_spectrogram_filename),
            tar_cd_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, tar_cd_mel_filename),
            tar_cd_mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (tar_cd_spectrogram_filename, tar_cd_mel_filename, n_frames,
            in_jd_mel_spectrogram_filename, in_cg_mel_spectrogram_filename)

def _process_utterance(out_dir, index, src_path, tgt_path):
    '''Preprocesses a single utterance audio pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        src_path: Path to the source audio file
        tgt_path: Path to the target audio file

    Returns:
        A (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames,
        src_spectrogram_filename) tuple to write to train.txt
    '''
    # Load the audio to numpy arrays:
    src_wav = audio.load_wav(src_path)
    tgt_wav = audio.load_wav(tgt_path)

    # Compute the linear-scale spectrograms from the wavs:
    src_spectrogram = audio.spectrogram(
        src_wav,
        num_src_freq=hparams.num_src_freq,
        frame_length_ms=hparams.src_frame_length_ms).astype(np.float32)
    src_n_frames = src_spectrogram.shape[1]
    tgt_spectrogram = audio.spectrogram(tgt_wav).astype(np.float32)
    tgt_n_frames = tgt_spectrogram.shape[1]

    # Compute the mel-scale spectrograms from the wavs:
    src_mel_spectrogram = audio.melspectrogram(src_wav).astype(np.float32)
    tgt_mel_spectrogram = audio.melspectrogram(tgt_wav).astype(np.float32)

    # Write the spectrograms to disk:
    src_spectrogram_filename = 'wav2wav_src-spec-%05d.npy' % index
    src_mel_filename = 'wav2wav_src-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, src_spectrogram_filename),
            src_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, src_mel_filename),
            src_mel_spectrogram.T, allow_pickle=False)
    tgt_spectrogram_filename = 'wav2wav_tgt-spec-%05d.npy' % index
    tgt_mel_filename = 'wav2wav_tgt-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, tgt_spectrogram_filename),
            tgt_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, tgt_mel_filename),
            tgt_mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (tgt_spectrogram_filename, tgt_mel_filename, tgt_n_frames,
            src_spectrogram_filename)

def _process_utterance(out_dir, index, wav_path_neutral, wav_path_happy):
    '''Preprocesses a single utterance pair of neutral/happy audio files.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path_neutral: Path to the neutral-style audio file
        wav_path_happy: Path to the happy-style audio file

    Returns:
        A (spectrogram_neutral_filename, mel_neutral_filename,
        spectrogram_happy_filename, mel_happy_filename, n_frames) tuple to
        write to train.txt
    '''
    # Load the audio to numpy arrays:
    wav1 = audio.load_wav(wav_path_neutral)
    wav2 = audio.load_wav(wav_path_happy)

    # Compute the neutral linear-scale spectrogram from the wav:
    spectrogram_neutral = audio.spectrogram(wav1).astype(np.float32)
    n_frames = spectrogram_neutral.shape[1]

    # Compute a neutral mel-scale spectrogram from the wav:
    mel_spectrogram_neutral = audio.melspectrogram(wav1).astype(np.float32)

    # Compute the happy spectrograms; note that n_frames is overwritten with
    # the happy frame count here:
    spectrogram_happy = audio.spectrogram(wav2).astype(np.float32)
    n_frames = spectrogram_happy.shape[1]
    mel_spectrogram_happy = audio.melspectrogram(wav2).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_neutral_filename = 'neutral-spec-%05d.npy' % index
    mel_neutral_filename = 'neutral-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_neutral_filename),
            spectrogram_neutral.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_neutral_filename),
            mel_spectrogram_neutral.T, allow_pickle=False)
    spectrogram_happy_filename = 'happy-spec-%05d.npy' % index
    mel_happy_filename = 'happy-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_happy_filename),
            spectrogram_happy.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_happy_filename),
            mel_spectrogram_happy.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_neutral_filename, mel_neutral_filename,
            spectrogram_happy_filename, mel_happy_filename, n_frames)

def _process_utterance(out_dir, index, source_wav_path, target_wav_path):
    '''Preprocesses a single utterance pair of source/target audio files.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        source_wav_path: Path to the source audio file
        target_wav_path: Path to the target audio file

    Returns:
        A (source_mel_filename, n_frames, target_spectrogram_filename,
        target_mel_filename) tuple to write to train.txt
    '''
    # Load the audio to numpy arrays:
    source_wav = audio.load_wav(source_wav_path)
    target_wav = audio.load_wav(target_wav_path)

    # Compute the linear-scale spectrogram from the target wav:
    target_spectrogram = audio.spectrogram(target_wav).astype(np.float32)
    n_frames = target_spectrogram.shape[1]

    # Compute mel-scale spectrograms from the wavs:
    source_mel_spectrogram = audio.melspectrogram(source_wav).astype(np.float32)
    target_mel_spectrogram = audio.melspectrogram(target_wav).astype(np.float32)

    # Write the spectrograms to disk:
    # source_spectrogram_filename = 'source-spec-%05d.npy' % index
    source_mel_filename = 'source-mel-%05d.npy' % index
    target_spectrogram_filename = 'target-spec-%05d.npy' % index
    target_mel_filename = 'target-mel-%05d.npy' % index
    # np.save(os.path.join(out_dir, source_spectrogram_filename),
    #         source_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, source_mel_filename),
            source_mel_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, target_spectrogram_filename),
            target_spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, target_mel_filename),
            target_mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (source_mel_filename, n_frames, target_spectrogram_filename,
            target_mel_filename)

def run_eval(args):
    # print(hparams_debug_string())
    is_teacher_force = False
    reference_mel = None
    synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
    synth.load(args.model, args.reference)
    base_path = get_output_base_path(args.model)
    if args.reference is not None:
        ref_wav = audio.load_wav(args.reference)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
        # path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference))[0])
        path = 'ref-%s.wav' % (os.path.splitext(os.path.basename(args.reference))[0])
    else:
        raise ValueError("You must set the reference audio.")
    with open('examples_test.txt', 'r') as fs:
        lines = fs.readlines()
    for i, line in enumerate(lines):
        args.text = line.strip().split('|')[-1]
        path_id = '%d_' % (i + 6)
        new_path = path_id + path
        print('Synthesizing: %s' % args.text)
        print('Output wav file: %s' % new_path)
        with open(new_path, 'wb') as f:
            f.write(synth.synthesize(args.text, reference_mel=reference_mel))

def _process_utterance(out_dir, index, wav_path, labels_path, text, person_id=1):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
    wav = wav[start:end]

    # Skip utterances that are too long:
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)

def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

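# For context, a minimal sketch of the driver loop that typically calls the
# _process_utterance above. This assumes a keithito-style LJSpeech layout
# (metadata.csv with 'id|transcription|normalized text' rows); the name
# build_from_path and the num_workers parameter are illustrative, not taken
# from the snippets in this file.
import os
from concurrent.futures import ProcessPoolExecutor
from functools import partial


def build_from_path(in_dir, out_dir, num_workers=1):
    executor = ProcessPoolExecutor(max_workers=num_workers)
    futures = []
    with open(os.path.join(in_dir, 'metadata.csv'), encoding='utf-8') as f:
        for index, line in enumerate(f, start=1):
            parts = line.strip().split('|')
            wav_path = os.path.join(in_dir, 'wavs', '%s.wav' % parts[0])
            text = parts[2]
            futures.append(executor.submit(
                partial(_process_utterance, out_dir, index, wav_path, text)))
    # Collect the per-utterance tuples and write them to train.txt:
    metadata = [future.result() for future in futures]
    with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
        for m in metadata:
            f.write('|'.join([str(x) for x in m]) + '\n')
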
def process_utterance(out_path, index, wav_path, text):
    '''Generates linear- and mel-scale spectrograms for a (text, wav) pair,
    saves the numpy arrays to disk, and returns the filenames of the saved
    arrays.
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    # .T: save the transposed arrays (frames along the first axis).
    # allow_pickle=False: disallow pickling, for security and portability.
    np.save(os.path.join(out_path, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_path, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, prompt_id, wav_path, text):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Trim leading and trailing silence:
    margin = int(hparams.sample_rate * 0.1)
    wav = wav[margin:-margin]
    wav, _ = librosa.effects.trim(wav, top_db=40, frame_length=1024, hop_length=256)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'amy-spec-%s.npy' % prompt_id
    mel_filename = 'amy-mel-%s.npy' % prompt_id
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, index, wav_path, pinyin):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        pinyin: The pinyin of the Chinese spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, pinyin) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'femalemandarin-spec-%05d.npy' % index
    mel_filename = 'femalemandarin-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, pinyin)

def run_eval(args):
    print(hparams_debug_string())
    is_teacher_force = False
    mel_targets = args.mel_targets
    reference_mel = None
    if args.mel_targets is not None:
        is_teacher_force = True
        mel_targets = np.load(args.mel_targets)
    synth = Synthesizer(teacher_forcing_generating=is_teacher_force)
    synth.load(args.checkpoint, args.reference_audio)
    base_path = get_output_base_path(args.checkpoint)
    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
        path = '%s_ref-%s.wav' % (base_path, os.path.splitext(os.path.basename(args.reference_audio))[0])
    else:
        if hparams.use_gst:
            print("*******************************")
            print("TODO: add style weights when there is no reference audio. "
                  "For now we use random weights, which may sometimes generate "
                  "unintelligible audio.")
            print("*******************************")
            path = '%s_ref-randomWeight.wav' % base_path
        else:
            raise ValueError("You must set the reference audio if you don't want to use GSTs.")
    with open(path, 'wb') as f:
        print('Synthesizing: %s' % args.text)
        print('Output wav file: %s' % path)
        f.write(synth.synthesize(args.text, reference_mel=reference_mel))

def synthesize(self, path_in, path_re, mel_targets=None, reference_mel=None, alignment_path=None):
    wav_in = audio.load_wav(path_in)
    wav_re = audio.load_wav(path_re)
    mel_in = audio.melspectrogram(wav_in).astype(np.float32)
    mel_re = audio.melspectrogram(wav_re).astype(np.float32)
    # print(mel_jp)
    feed_dict = {
        self.model.inputs: [mel_in.T],
        self.model.input_lengths: np.asarray([len(mel_in)], dtype=np.int32),
        self.model.inputs_jp: [mel_re.T],
    }
    # if mel_targets is not None:
    #     mel_targets = np.expand_dims(mel_targets, 0)
    #     print(mel_targets.shape)
    #     feed_dict.update({self.model.mel_targets: np.asarray(mel_targets, dtype=np.float32)})
    # if reference_mel is not None:
    #     reference_mel = np.expand_dims(reference_mel, 0)
    #     print(reference_mel.shape)
    #     feed_dict.update({self.model.reference_mel: np.asarray(reference_mel, dtype=np.float32)})
    wav_out, alignments = self.session.run(
        [self.wav_output, self.alignments], feed_dict=feed_dict)
    wav = audio.inv_preemphasis(wav_out)
    end_point = audio.find_endpoint(wav)
    wav = wav[:end_point]

    # Build a unique output name from the current timestamp plus a random
    # two-digit suffix:
    nowTime = datetime.datetime.now().strftime("%Y%m%d%H%M%S")  # current time
    randomNum = random.randint(0, 100)  # random integer n with 0 <= n <= 100
    if randomNum <= 10:
        randomNum = str(0) + str(randomNum)
    uniqueNum = str(nowTime) + str(randomNum)
    out_dir = "static\\out\\" + uniqueNum + ".wav"
    out_name = uniqueNum + ".wav"
    audio.save_wav(wav, out_dir)
    out = io.BytesIO()
    audio.save_wav(wav, out)
    # n_frame = int(end_point / (hparams.frame_shift_ms / 1000 * hparams.sample_rate)) + 1
    # plot.plot_alignment(alignments[:, :n_frame], alignment_path, info='%s' % (path))
    return out_dir, out_name

def convert_file(audio_path):
    y = audio.load_wav(audio_path)
    peak = np.abs(y).max()
    if hp.peak_norm or peak > 1.0:
        # Normalize so that the waveform peaks at 0.9:
        y *= (0.9 / peak)
    linear = audio.spectrogram(y)
    mel = audio.melspectrogram(y)
    return mel.astype(np.float32), linear.astype(np.float32)

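# Usage sketch for convert_file (illustrative): extract features for every
# wav under a dataset directory and save them as .npy files. The directory
# names and the glob pattern are assumptions, not taken from the snippet
# above.
import glob
import os

import numpy as np

out_dir = 'features'
os.makedirs(out_dir, exist_ok=True)
for wav_file in glob.glob('wavs/*.wav'):
    mel, linear = convert_file(wav_file)
    stem = os.path.splitext(os.path.basename(wav_file))[0]
    np.save(os.path.join(out_dir, '%s_mel.npy' % stem), mel, allow_pickle=False)
    np.save(os.path.join(out_dir, '%s_linear.npy' % stem), linear, allow_pickle=False)
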
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the LPC features to disk and returns a tuple to write to the
    train.txt file.

    Args:
        out_dir: The directory to write the features into
        index: The numeric index to use in the feature filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (feature_filename, n_frames, pinyin) tuple to write to train.txt
    '''
    # Convert the text to pinyin, dropping the prosody markers:
    text = text.replace("#1", "").replace("#2", "").replace("#3", "").replace("#4", "")
    pinyin = " ".join(get_pinyin(text))

    # Load the audio to a numpy array and peak-normalize it:
    wav = audio.load_wav(wav_path)
    wav = wav / np.max(np.abs(wav)) * 0.9

    # Denoise, using both ends of the audio as the noise profile:
    if hparams.mmse_denoise_by_bothEndOfAudio and len(wav) > hparams.sample_rate * (hparams.length_as_noise * 2 + 0.1):
        noise_wav = np.concatenate([
            wav[:int(hparams.sample_rate * hparams.length_as_noise)],
            wav[-int(hparams.sample_rate * hparams.length_as_noise):]
        ])
        profile = logmmse.profile_noise(noise_wav, hparams.sample_rate)
        wav = logmmse.denoise(wav, profile, eta=0)

    # Trim silence:
    wav = audio.trim_silence(wav, hparams.trim_top_db)  # top_db=30 for aishell, 60 for BZNSYP
    # audio.save_wav(wav, wav_path.replace(".wav", "_trimed.wav"))

    # Convert the wav to 16-bit int:
    wav *= 32768
    wav = wav.astype(np.int16)

    # Extract the LPC features:
    extractor = lpcnet.FeatureExtractor()
    feat = extractor.compute_feature(wav)
    n_frames = feat.shape[0]

    # Write the LPC features to disk:
    feature_filename = 'biaobei-lpc-feat-%05d.npy' % index
    np.save(os.path.join(out_dir, feature_filename), feat, allow_pickle=False)

    # Return a tuple describing this training example:
    return (feature_filename, n_frames, pinyin)

def preprocess_utterance(wav_file, input_path, output_path):
    wav = audio.load_wav(wav_file)
    wav_path, name = os.path.split(wav_file)
    # Mirror the input directory structure under the output path:
    out_dir = wav_path.replace(input_path, output_path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    mel_filename = name.replace('.wav', '.npy')
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    print(mel_filename, mel_spectrogram.shape[1])

def _process_utterance(out_dir, index, wav_path, text):
    wav = audio.load_wav(wav_path)
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'selvas-spec-%04d.npy' % int(index)
    mel_filename = 'selvas-mel-%04d.npy' % int(index)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)

def run_eval(args):
    print(hparams_debug_string())
    synth = Synthesizer()
    synth.load(args.checkpoint)
    base_path = get_output_base_path(args.checkpoint)
    wav = load_wav(args.reference_audio)
    mel = melspectrogram(wav).transpose()
    for i, text in enumerate(sentences):
        path = '%s-%d.wav' % (base_path, i)
        print('Synthesizing: %s' % path)
        with open(path, 'wb') as f:
            f.write(synth.synthesize(text, mel))

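# A minimal entry-point sketch for the run_eval variants in this file. The
# flag names mirror the attributes the function reads (args.checkpoint,
# args.reference_audio); the main() wrapper itself is an assumption, not
# taken from any snippet here.
import argparse


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', required=True,
                        help='Path to model checkpoint')
    parser.add_argument('--reference_audio', required=True,
                        help='Reference audio to extract the mel target from')
    args = parser.parse_args()
    run_eval(args)


if __name__ == '__main__':
    main()
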
def _process_utterance(out_dir, index, wav_path):
    '''Preprocesses a single utterance audio file.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input

    Returns:
        A (spectrogram_filename, mel_filename, n_frames) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Cut or pad the wav to a fixed length of 2 s:
    length = hparams.sample_rate * hparams.duration
    wav = librosa.util.fix_length(wav, length)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Compute MFCCs:
    # mfcc = audio.mfcc(wav).astype(np.float32)

    # Write the spectrograms to disk:
    wav_name = os.path.basename(wav_path)
    wav_name = wav_name.split('.')[0]
    spectrogram_filename = 'spec-%s.npy' % wav_name
    mel_filename = 'mel-%s.npy' % wav_name
    mfcc_filename = 'mfcc-%s.npy' % wav_name
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    # np.save(os.path.join(out_dir, mfcc_filename), mfcc.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames)

def _process_utterance(out_dir, index, wav_path, text, person_id):
    # Load the wav file:
    wav = audio.load_wav(wav_path)
    # max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    # if len(wav) > max_samples:
    #     return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'arctic-spec-%05d.npy' % index
    mel_filename = 'arctic-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text, person_id)

def _process_utterance(out_dir, index, wav_path, text):
    wav, _ = audio.load_wav(wav_path)
    spectrogram = audio.spectrogram(wav).astype(np.float32)  # (1025, frames)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)  # (80, frames)

    spectrogram_filename = 'kss-spec-%05d.npy' % index
    mel_filename = 'kss-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)  # (frames, 1025)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)  # (frames, 80)
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, name, wav_path, text):
    wav = audio.load_wav(wav_path)
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'bznsyp-spec-%s.npy' % name
    mel_filename = 'bznsyp-mel-%s.npy' % name
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    # text = sentence_to_pinyin(text)
    return (spectrogram_filename, mel_filename, n_frames, text)

def __generate_spectrograms(file_path, category, index, out_dir):
    wav = audio.load_wav(file_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = '{}spec{}.npy'.format(category, index)
    mel_filename = '{}mel{}.npy'.format(category, index)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

def get_wav_linear_and_mel_target(wav_path, set_spec_length=None):
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Return a tuple describing this training example, truncated to
    # set_spec_length frames if requested:
    if set_spec_length is not None:
        return (spectrogram.T[:set_spec_length], mel_spectrogram.T[:set_spec_length], n_frames)
    # wav = wav.reshape(-1, 1)
    # wav = np.pad(wav, [[2048, 0], [0, 0]], 'constant')
    # wav = np.pad(wav, [[2048, 0]], 'constant')
    return (wav, spectrogram.T, mel_spectrogram.T, n_frames)

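# Usage sketch (illustrative): load two utterances truncated to the same
# number of frames so their targets can be compared or batched together.
# The file names and the 400-frame cap are assumptions.
lin_a, mel_a, n_a = get_wav_linear_and_mel_target('a.wav', set_spec_length=400)
lin_b, mel_b, n_b = get_wav_linear_and_mel_target('b.wav', set_spec_length=400)
# Truncation caps the frame axis; shorter utterances keep their own length:
assert lin_a.shape[0] <= 400 and lin_b.shape[0] <= 400
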
def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    print('wav_path:', wav_path)
    wav = audio.load_wav(wav_path)
    print('wav:', wav.shape)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    # print('spectrogram:', spectrogram, '\nspectrogram.shape:', spectrogram.shape)
    n_frames = spectrogram.shape[1]
    print('n_frames:', n_frames)

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)
    # print('mel_spectrogram:', mel_spectrogram, '\nmel_spectrogram.shape:', mel_spectrogram.shape)

    # Write the spectrograms to disk:
    spectrogram_filename = 'ljspeech-spec-%05d.npy' % index
    mel_filename = 'ljspeech-mel-%05d.npy' % index
    print('spectrogram_filename:', spectrogram_filename)
    print('mel_filename:', mel_filename)
    print('out_dir:', out_dir)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, index, wav_path, labels_path, text):
    # Load the wav file and trim silence from the ends:
    wav = audio.load_wav(wav_path)
    start_offset, end_offset = _parse_labels(labels_path)
    start = int(start_offset * hparams.sample_rate)
    end = int(end_offset * hparams.sample_rate) if end_offset is not None else -1
    wav = wav[start:end]

    # Skip utterances that are too long:
    max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
    if len(wav) > max_samples:
        return None

    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'blizzard-spec-%05d.npy' % index
    mel_filename = 'blizzard-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(wav_path, text, id):
    '''Preprocesses a single utterance audio/text pair.

    This computes the mel and linear scale spectrograms in memory and returns
    them together with the raw waveform.

    Args:
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        id: Speaker identity

    Returns:
        A (wav, spectrogram, mel_spectrogram, text, id) tuple describing the
        example
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32).T

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    return wav, spectrogram, mel_spectrogram, text, id

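# Usage sketch (illustrative): unlike the disk-writing variants above, this
# _process_utterance returns everything in memory, so a caller can collect
# examples directly. The wav paths, texts, and ids below are assumptions.
examples = [
    _process_utterance('wavs/0001.wav', 'hello world', 0),
    _process_utterance('wavs/0002.wav', 'good morning', 1),
]
for wav, spec, mel, text, speaker_id in examples:
    print(text, speaker_id, wav.shape, spec.shape, mel.shape)
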
def run_eval(args):
    print(hparams_debug_string())
    reference_mel = None
    synth = Synthesizer()
    synth.load(args.checkpoint, args.reference_audio)
    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    base_path = get_output_base_path(args.checkpoint)
    for i, text in enumerate(sentences):
        path = '%s_%d_%.1f_%d.wav' % (base_path + '_gst', hparams.gst_index, hparams.gst_scale, i)
        print('Synthesizing: %s' % path)
        with open(path, 'wb') as f:
            f.write(synth.synthesize(text, reference_mel=reference_mel))

def _process_utterance(out_dir, index, wav_path, pinyin):
    wav = audio.load_wav(wav_path)
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frame = spectrogram.shape[1]
    # Skip utterances that are too long:
    if n_frame > hp.max_frame_num:
        return None
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    spectrogram_filename = 'thchs30-spec-%05d.npy' % index
    mel_filename = 'thchs30-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)
    return (spectrogram_filename, mel_filename, n_frame, pinyin)

def _process_utterance(out_dir, name, wav_path, text, hparams):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        name: The name to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file
        hparams: Hyperparameters used for loading and feature extraction

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path, hparams)

    # Trim silences:
    wav = audio.trim_silence(wav, hparams)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav, hparams).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'mailabs-spec-{}.npy'.format(name)
    mel_filename = 'mailabs-mel-{}.npy'.format(name)
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def main():
    accepted_modes = ['eval', 'synthesis', 'live']
    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', default='pretrained/', help='Path to model checkpoint')
    parser.add_argument('--hparams', default='',
                        help='Hyperparameter overrides as a comma-separated list of name=value pairs')
    parser.add_argument('--name', required=True, help='Name of logging directory.')
    # args.model is checked below, so the parser must define it (the original
    # parser omitted this argument):
    parser.add_argument('--model', default='Tacotron',
                        help='Model to test: Tacotron or Wavenet')
    parser.add_argument('--mels_dir', default='gst_output/eval/',
                        help='folder to contain mels to synthesize audio from using the Wavenet')
    parser.add_argument('--mode', default='eval',
                        help='mode of run: can be one of {}'.format(accepted_modes))
    parser.add_argument('--GTA', default='True',
                        help='Ground truth aligned synthesis, defaults to True, only considered in synthesis mode')
    parser.add_argument('--text', required=True, default=None, help='Single test text sentence')
    parser.add_argument('--reference_audio', default=None, help='Reference audio path')
    parser.add_argument('--output_dir', default='output/', help='folder to contain synthesized mel spectrograms')
    args = parser.parse_args()

    if args.mode not in accepted_modes:
        raise ValueError('accepted modes are: {}, found {}'.format(accepted_modes, args.mode))
    if args.mode == 'live' and args.model == 'Wavenet':
        raise RuntimeError('Wavenet vocoder cannot be tested live due to its slow generation. Live only works with Tacotron!')
    if args.GTA not in ('True', 'False'):
        raise ValueError('GTA option must be either True or False')
    if args.mode == 'live':
        warn('Requested a live evaluation with Tacotron-2, Wavenet will not be used!')
    if args.mode == 'synthesis':
        raise ValueError('Running WaveNet on the entire dataset is not recommended; the world might end before the synthesis finishes :) (only eval allowed)')

    gst_checkpoint, wave_checkpoint, hparams = prepare_run(args)
    sentences = get_sentences(args)

    if args.reference_audio is not None:
        ref_wav = audio.load_wav(args.reference_audio)
        reference_mel = audio.melspectrogram(ref_wav).astype(np.float32).T
    else:
        reference_mel = None

    synthesize(args, hparams, gst_checkpoint, wave_checkpoint, sentences, reference_mel)

def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Skip utterances that are too long (guard the None case before using
    # _max_out_length in arithmetic):
    if _max_out_length is not None:
        max_samples = _max_out_length * hparams.frame_shift_ms / 1000 * hparams.sample_rate
        if len(wav) > max_samples:
            return None

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'blizzard2013-spec-%05d.npy' % index
    mel_filename = 'blizzard2013-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)

def _process_utterance(out_dir, index, wav_path, text):
    '''Preprocesses a single utterance audio/text pair.

    This writes the mel and linear scale spectrograms to disk and returns a
    tuple to write to the train.txt file.

    Args:
        out_dir: The directory to write the spectrograms into
        index: The numeric index to use in the spectrogram filenames.
        wav_path: Path to the audio file containing the speech input
        text: The text spoken in the input audio file

    Returns:
        A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
    '''
    # Convert the Unicode text to CISAMPA:
    url = 'http://127.0.0.1:8080/get_sentence/' + text
    text = requests.get(url).text

    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    # Compute the linear-scale spectrogram from the wav:
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    n_frames = spectrogram.shape[1]

    # Compute a mel-scale spectrogram from the wav:
    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

    # Write the spectrograms to disk:
    spectrogram_filename = 'urdu-spec-%05d.npy' % index
    mel_filename = 'urdu-mel-%05d.npy' % index
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, mel_filename, n_frames, text)
