class LJSpeech:
    def __init__(self, in_dir, out_dir, hparams):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)

    def text_and_audio_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_audio_path())

    def process_data(self, rdd: RDD):
        return rdd.mapValues(self._process_source_and_target)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        text = parts[2]
        return TextAndAudioPath(index, key, wav_path, text)

    def _extract_all_text_and_audio_path(self):
        index = 1
        with open(os.path.join(self.in_dir, 'metadata.csv'), mode='r', encoding='utf-8') as f:
            for line in f:
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)
                index += 1

    def _process_source_and_target(self, paths: TextAndAudioPath):
        wav = self.audio.load_wav(paths.wav_path)
        n_samples = len(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        n_frames = mel_spectrogram.shape[0]
        filename = f"{paths.key}.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        tfrecord.write_preprocessed_data(paths.id, paths.key, wav, mel_spectrogram,
                                         paths.text, filepath)
        return SourceAndTargetMetaData(paths.id, paths.key, n_samples, n_frames, filepath)

    def _process_mel(self, paths: TextAndAudioPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        sum_mel_powers = np.sum(mel_spectrogram, axis=1)
        n_frames = mel_spectrogram.shape[0]
        return MelMetaData(n_frames, sum_mel_powers)
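# A minimal sketch of how the class above might be driven from a Spark job;
# the directory paths and the hparams object are placeholders, and the
# surrounding project is assumed to provide Audio, TextAndAudioPath and the
# tfrecord helpers used by _process_source_and_target.
from pyspark import SparkContext


def preprocess_ljspeech(in_dir, out_dir, hparams):
    sc = SparkContext(appName="ljspeech-preprocess")
    try:
        ljspeech = LJSpeech(in_dir, out_dir, hparams)
        rdd = ljspeech.text_and_audio_path_rdd(sc)
        # Each value is a SourceAndTargetMetaData record describing one written tfrecord.
        metadata = ljspeech.process_data(rdd).collect()
        print("wrote %d records" % len(metadata))
    finally:
        sc.stop()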
class LJSpeech:
    def __init__(self, in_dir, mel_out_dir, wav_out_dir, hparams):
        self.in_dir = in_dir
        self.mel_out_dir = mel_out_dir
        self.wav_out_dir = wav_out_dir
        self.audio = Audio(hparams)

    @property
    def record_ids(self):
        return map(lambda v: str(v), range(1, 13101))

    def record_file_path(self, record_id, kind):
        assert kind in ["source", "target"]
        return os.path.join(self.mel_out_dir, f"ljspeech-{kind}-{int(record_id):05d}.tfrecord")

    def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_path())

    def process_wav(self, rdd: RDD):
        return rdd.mapValues(self._process_wav)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        text = parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndPath(index, key, wav_path, None, text)

    def _extract_all_text_and_path(self):
        with open(os.path.join(self.in_dir, 'metadata.csv'), mode='r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)

    def _process_wav(self, paths: TextAndPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        mel_spectrogram = self.audio.normalize_mel(mel_spectrogram)
        mel_filepath = os.path.join(self.mel_out_dir, f"{paths.key}.mfbsp")
        wav_filepath = os.path.join(self.wav_out_dir, f"{paths.key}.wav")
        mel_spectrogram.tofile(mel_filepath, format="<f4")
        self.audio.save_wav(wav, wav_filepath)
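# The ".mfbsp" files written above are headerless binary dumps: ndarray.tofile
# with the default empty separator writes raw float32 samples and ignores the
# format argument. A minimal sketch of reading one back, assuming the mel
# dimensionality is available as hparams.num_mels (that attribute name is an
# assumption, not shown above):
import numpy as np


def load_mfbsp(path, num_mels):
    """Load an (n_frames, num_mels) mel spectrogram written by _process_wav."""
    data = np.fromfile(path, dtype=np.float32)
    return data.reshape(-1, num_mels)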
class VCTK:
    def __init__(self, in_dir, out_dir, hparams, speaker_info_filename='speaker-info.txt'):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.speaker_info_filename = speaker_info_filename
        self.audio = Audio(hparams)

    def list_files(self):
        def wav_files(speaker_info: SpeakerInfo):
            wav_dir = os.path.join(self.in_dir, f"wav48/p{speaker_info.id}")
            return [os.path.join(wav_dir, wav_file)
                    for wav_file in sorted(os.listdir(wav_dir))
                    if wav_file.endswith('.wav')]

        def text_files(speaker_info: SpeakerInfo):
            txt_dir = os.path.join(self.in_dir, f"txt/p{speaker_info.id}")
            return [os.path.join(txt_dir, txt_file)
                    for txt_file in sorted(os.listdir(txt_dir))
                    if txt_file.endswith('.txt')]

        def text_and_wav_records(file_pairs, speaker_info):
            def create_record(txt_f, wav_f, speaker_info):
                # Use splitext rather than str.strip('.wav'): strip removes a
                # set of characters, not a suffix, and can eat letters of the key.
                key1 = os.path.splitext(os.path.basename(wav_f))[0]
                key2 = os.path.splitext(os.path.basename(txt_f))[0]
                assert key1 == key2
                return TxtWavRecord(0, key1, txt_f, wav_f, speaker_info)

            return [create_record(txt_f, wav_f, speaker_info)
                    for txt_f, wav_f in file_pairs]

        records = sum([text_and_wav_records(zip(text_files(si), wav_files(si)), si)
                       for si in self._load_speaker_info()], [])
        return [TxtWavRecord(i, r.key, r.txt_path, r.wav_path, r.speaker_info)
                for i, r in enumerate(records)]

    def process_sources(self, rdd: RDD):
        return rdd.map(self._process_txt)

    def process_targets(self, rdd: RDD):
        return TargetRDD(rdd.map(self._process_wav).persist(StorageLevel.MEMORY_AND_DISK))

    def _load_speaker_info(self):
        with open(os.path.join(self.in_dir, self.speaker_info_filename), mode='r', encoding='utf8') as f:
            for l in f.readlines()[1:]:
                si = l.split()
                gender = 0 if si[2] == 'F' else 1
                if str(si[0]) != "315":  # FixMe: why is speaker 315 missing?
                    yield SpeakerInfo(int(si[0]), int(si[1]), gender)

    def _process_wav(self, record: TxtWavRecord):
        wav = self.audio.load_wav(record.wav_path)
        wav = self.audio.trim(wav)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        file_path = os.path.join(self.out_dir, f"{record.key}.target.tfrecord")
        write_preprocessed_target_data(record.id, record.key, mel_spectrogram, file_path)
        return MelStatistics(id=record.id,
                             key=record.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram), axis=0))

    def _process_txt(self, record: TxtWavRecord):
        with open(os.path.join(self.in_dir, record.txt_path), mode='r', encoding='utf8') as f:
            txt = f.readline().rstrip("\n")
        sequence, clean_text = text_to_sequence(txt, basic_cleaners)
        source = np.array(sequence, dtype=np.int64)
        file_path = os.path.join(self.out_dir, f"{record.key}.source.tfrecord")
        write_preprocessed_source_data(record.id, record.key, source, clean_text,
                                       record.speaker_info.id, record.speaker_info.age,
                                       record.speaker_info.gender, file_path)
        return record.key
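# The MelStatistics records returned by _process_wav carry sufficient
# statistics (sum, moment2, length) to recover corpus-level per-channel mean
# and standard deviation without a second pass over the audio. The reduction
# performed by TargetRDD is not shown above, so the sketch below is an
# assumption about how those fields are meant to be combined:
import numpy as np


def aggregate_mel_statistics(stats_list):
    """Combine per-utterance MelStatistics into per-channel mean/std/min/max."""
    total_frames = sum(s.length for s in stats_list)
    total_sum = np.sum([s.sum for s in stats_list], axis=0)
    total_moment2 = np.sum([s.moment2 for s in stats_list], axis=0)
    mean = total_sum / total_frames
    # Var[x] = E[x^2] - (E[x])^2, computed from the accumulated second moment.
    var = total_moment2 / total_frames - np.square(mean)
    return {
        'mean': mean,
        'std': np.sqrt(np.maximum(var, 0.0)),
        'min': np.min([s.min for s in stats_list], axis=0),
        'max': np.max([s.max for s in stats_list], axis=0),
    }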
class LJSpeech:
    def __init__(self, in_dir, out_dir, hparams):
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)

    @property
    def record_ids(self):
        return map(lambda v: str(v), range(1, 13101))

    def record_file_path(self, record_id, kind):
        assert kind in ["source", "target"]
        return os.path.join(self.out_dir, f"ljspeech-{kind}-{int(record_id):05d}.tfrecord")

    def text_and_path_rdd(self, sc: SparkContext):
        return sc.parallelize(self._extract_all_text_and_path())

    def process_targets(self, rdd: RDD):
        return TargetRDD(rdd.mapValues(self._process_target).persist(StorageLevel.MEMORY_AND_DISK))

    def process_sources(self, rdd: RDD):
        return rdd.mapValues(self._process_source)

    def _extract_text_and_path(self, line, index):
        parts = line.strip().split('|')
        key = parts[0]
        text = parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndPath(index, key, wav_path, None, text)

    def _extract_all_text_and_path(self):
        with open(os.path.join(self.in_dir, 'metadata.csv'), mode='r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)

    def _text_to_sequence(self, text):
        sequence, clean_text = text_to_sequence(text, english_cleaners)
        sequence = np.array(sequence, dtype=np.int64)
        return sequence, clean_text

    def _process_target(self, paths: TextAndPath):
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        filename = f"{paths.key}.target.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        write_preprocessed_target_data(paths.id, paths.key, mel_spectrogram, filepath)
        return MelStatistics(id=paths.id,
                             key=paths.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram), axis=0))

    def _process_source(self, paths: TextAndPath):
        sequence, clean_text = self._text_to_sequence(paths.text)
        filename = f"{paths.key}.source.tfrecord"
        filepath = os.path.join(self.out_dir, filename)
        write_preprocessed_source_data(paths.id, paths.key, sequence, clean_text, filepath)
        return paths.key
class Synthesizer:
    def __init__(self, model_path, out_dir, text_file, sil_file,
                 use_griffin_lim, gen_wavenet_fea, hparams):
        self.out_dir = out_dir
        self.text_file = text_file
        self.sil_file = sil_file
        self.use_griffin_lim = use_griffin_lim
        self.gen_wavenet_fea = gen_wavenet_fea
        self.hparams = hparams
        self.model = get_model(model_path, hparams)
        self.audio_class = Audio(hparams)
        if hparams.use_phone:
            from text.phones import Phones
            phone_class = Phones(hparams.phone_set_file)
            self.text_to_sequence = phone_class.text_to_sequence
        else:
            from text import text_to_sequence
            self.text_to_sequence = text_to_sequence
        if hparams.is_multi_speakers and not hparams.use_pretrained_spkemb:
            self.speaker_id_dict = gen_speaker_id_dict(hparams)
        self.out_png_dir = os.path.join(self.out_dir, 'png')
        os.makedirs(self.out_png_dir, exist_ok=True)
        if self.use_griffin_lim:
            self.out_wav_dir = os.path.join(self.out_dir, 'wav')
            os.makedirs(self.out_wav_dir, exist_ok=True)
        if self.gen_wavenet_fea:
            self.out_mel_dir = os.path.join(self.out_dir, 'mel')
            os.makedirs(self.out_mel_dir, exist_ok=True)

    def get_mel_gt(self, wavname):
        hparams = self.hparams
        if not hparams.load_mel:
            # Compute the mel spectrogram from the raw waveform.
            if hparams.use_hdf5:
                with h5py.File(hparams.hdf5_file, 'r') as h5:
                    data = h5[wavname][:]
                # Assumption: the HDF5 entry stores the raw waveform at the
                # configured sample rate.
                audio = data
                sr_t = hparams.sample_rate
            else:
                filename = os.path.join(hparams.wav_dir, wavname + '.wav')
                sr_t, audio = wavread(filename)
            assert sr_t == hparams.sample_rate
            audio_norm = audio / hparams.max_wav_value
            wav = self.audio_class._preemphasize(audio_norm)
            melspec = self.audio_class.melspectrogram(wav, clip_norm=True)
            melspec = torch.FloatTensor(melspec.astype(np.float32))
        else:
            # Load a precomputed mel spectrogram.
            if hparams.use_zip:
                with zipfile.ZipFile(hparams.zip_path, 'r') as f:
                    data = f.read(wavname)
                melspec = np.load(io.BytesIO(data))
                melspec = torch.FloatTensor(melspec.astype(np.float32))
            elif hparams.use_hdf5:
                with h5py.File(hparams.hdf5_file, 'r') as h5:
                    melspec = h5[wavname][:]
                melspec = torch.FloatTensor(melspec.astype(np.float32))
            else:
                filename = os.path.join(hparams.wav_dir, wavname + '.npy')
                melspec = torch.from_numpy(np.load(filename))
        melspec = torch.unsqueeze(melspec, 0)
        return melspec

    def get_inputs(self, meta_data):
        hparams = self.hparams
        # Prepare the text input; meta lines look like "filename|style_id|...|text".
        filename = meta_data[0].strip().split('|')[0]
        print(meta_data[0].strip().split('|')[-1])
        print(meta_data[0].strip().split('|')[1])
        sequence = np.array(
            self.text_to_sequence(meta_data[0].strip().split('|')[-1],
                                  ['english_cleaners']))
        print(sequence)
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).to(device).long()
        if hparams.is_multi_speakers:
            if hparams.use_pretrained_spkemb:
                ref_file = meta_data['r']
                spk_embedding = np.array(np.load(ref_file))
                spk_embedding = torch.autograd.Variable(
                    torch.from_numpy(spk_embedding)).to(device).float()
                inputs = (sequence, spk_embedding)
            else:
                speaker_name = filename.split('_')[0]
                speaker_id = self.speaker_id_dict[speaker_name]
                speaker_id = np.array([speaker_id])
                speaker_id = torch.autograd.Variable(
                    torch.from_numpy(speaker_id)).to(device).long()
                inputs = (sequence, speaker_id)
            if hparams.is_multi_styles:
                style_id = np.array([int(meta_data[0].strip().split('|')[1])])
                style_id = torch.autograd.Variable(
                    torch.from_numpy(style_id)).to(device).long()
                inputs = (sequence, style_id)
        elif hparams.use_vqvae:
            ref_file = meta_data['r']
            spk_ref = self.get_mel_gt(ref_file)
            inputs = (sequence, spk_ref)
        else:
            inputs = (sequence)
        return inputs, filename

    def gen_mel(self, meta_data):
        inputs, filename = self.get_inputs(meta_data)
        speaker_id = None
        style_id = None
        spk_embedding = None
        spk_ref = None
        if self.hparams.is_multi_speakers:
            if self.hparams.use_pretrained_spkemb:
                sequence, spk_embedding = inputs
            else:
                sequence, speaker_id = inputs
        elif self.hparams.use_vqvae:
            sequence, spk_ref = inputs
        else:
            sequence = inputs
        if self.hparams.is_multi_styles:
            sequence, style_id = inputs

        # Decode the text input and plot the results.
        with torch.no_grad():
            mel_outputs, gate_outputs, att_ws = self.model.inference(
                sequence, self.hparams,
                spk_id=speaker_id,
                style_id=style_id,
                spemb=spk_embedding,
                spk_ref=spk_ref)
        duration_list = DurationCalculator._calculate_duration(att_ws)
        print('att_ws.shape=', att_ws.shape)
        print('duration=', duration_list)
        print('duration_sum=', torch.sum(duration_list))
        print('focus_rate=', DurationCalculator._calculate_focus_rete(att_ws))
        # mel_outputs: (length, dim) -> (dim, length)
        mel_outputs = mel_outputs.transpose(0, 1).float().data.cpu().numpy()
        mel_outputs_with_duration = get_duration_matrix(
            char_text_dir=self.text_file,
            duration_tensor=duration_list,
            save_mode='phone').transpose(0, 1).float().data.cpu().numpy()
        gate_outputs = gate_outputs.float().data.cpu().numpy()
        att_ws = att_ws.float().data.cpu().numpy()

        image_path = os.path.join(self.out_png_dir, "{}_att.png".format(filename))
        _plot_and_save(att_ws, image_path)

        image_path = os.path.join(self.out_png_dir, "{}_spec_stop.png".format(filename))
        fig, axes = plt.subplots(3, 1, figsize=(8, 8))
        axes[0].imshow(mel_outputs, aspect='auto', origin='lower', interpolation='none')
        axes[1].imshow(mel_outputs_with_duration, aspect='auto', origin='lower',
                       interpolation='none')
        axes[2].scatter(range(len(gate_outputs)), gate_outputs, alpha=0.5,
                        color='red', marker='.', s=5, label='predicted')
        plt.savefig(image_path, format='png')
        plt.close()
        return mel_outputs, filename

    def gen_wav_griffin_lim(self, mel_outputs, filename):
        grf_wav = self.audio_class.inv_mel_spectrogram(mel_outputs)
        grf_wav = self.audio_class.inv_preemphasize(grf_wav)
        wav_path = os.path.join(self.out_wav_dir, "{}-gl.wav".format(filename))
        self.audio_class.save_wav(grf_wav, wav_path)

    def gen_wavenet_feature(self, mel_outputs, filename, add_end_sil=True):
        # Denormalize, then rescale to [0, 1].
        mel = self.audio_class._denormalize(mel_outputs)
        mel = np.clip((mel - self.audio_class.hparams.min_level_db) /
                      (-self.audio_class.hparams.min_level_db), 0, 1)
        mel = mel.T.astype(np.float32)
        frame_size = 200
        SILSEG = 0.3
        SAMPLING = 16000
        sil_samples = int(SILSEG * SAMPLING)
        sil_frames = int(sil_samples / frame_size)
        sil_data, _ = soundfile.read(self.sil_file)
        sil_data = sil_data[:sil_samples]
        sil_mel_spec, _ = self.audio_class._magnitude_spectrogram(sil_data, clip_norm=True)
        sil_mel_spec = (sil_mel_spec + 4.0) / 8.0
        # Pad with leading (and optionally trailing) silence frames.
        pad_mel_data = np.concatenate((sil_mel_spec[:sil_frames], mel), axis=0)
        if add_end_sil:
            pad_mel_data = np.concatenate((pad_mel_data, sil_mel_spec[:sil_frames]), axis=0)
        out_mel_file = os.path.join(self.out_mel_dir, '{}-wn.mel'.format(filename))
        save_htk_data(pad_mel_data, out_mel_file)

    def inference_f(self, meta_data=None):
        # The entry passed in by inference() is unused; the meta data is
        # re-read from text_file.
        meta_data = _read_meta_yyh(self.text_file)
        mel_outputs, filename = self.gen_mel(meta_data)
        print('my_mel_outputs=', mel_outputs)
        print('my_mel_outputs_max=', np.max(mel_outputs))
        print('my_mel_outputs_min=', np.min(mel_outputs))
        # Debug: override the generated mel with a previously dumped one.
        mel_outputs = np.load(r'../out_0.npy').transpose(1, 0)
        mel_outputs = mel_outputs * 8.0 - 4.0
        print('his_mel_outputs=', mel_outputs)
        print('his_mel_outputs_max=', np.max(mel_outputs))
        print('his_mel_outputs_min=', np.min(mel_outputs))
        if self.use_griffin_lim:
            self.gen_wav_griffin_lim(mel_outputs, filename)
        if self.gen_wavenet_fea:
            self.gen_wavenet_feature(mel_outputs, filename)
        return filename

    def inference(self):
        all_meta_data = _read_meta(self.text_file, self.hparams.meta_format)
        list(map(self.inference_f, all_meta_data))
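# A minimal sketch of running the synthesizer end to end; every path below is
# a placeholder, and the hparams object is assumed to be provided by the
# surrounding project (the original code also reads a module-level hparams).
if __name__ == '__main__':
    synthesizer = Synthesizer(model_path='checkpoints/model.pt',
                              out_dir='synth_out',
                              text_file='test_meta.txt',
                              sil_file='silence.wav',
                              use_griffin_lim=True,
                              gen_wavenet_fea=False,
                              hparams=hparams)
    # Reads every entry in text_file, writes attention/spectrogram plots under
    # synth_out/png and Griffin-Lim wavs under synth_out/wav.
    synthesizer.inference()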