Beispiel #1
0
class LJSpeech:
    """Preprocessing driver for a corpus laid out like LJSpeech.

    Reads ``metadata.csv`` (``|``-separated fields) from ``in_dir``, resolves
    every entry to ``wavs/<key>.wav`` and writes one ``.tfrecord`` file per
    utterance into ``out_dir``.
    """

    def __init__(self, in_dir, out_dir, hparams):
        """Remember the directories and build the audio helper.

        Args:
            in_dir: Directory containing ``metadata.csv`` and ``wavs/``.
            out_dir: Directory that receives the generated ``.tfrecord`` files.
            hparams: Hyper-parameters handed to ``Audio``.
        """
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)

    def text_and_audio_path_rdd(self, sc: "SparkContext"):
        """Return an RDD of ``(index, TextAndAudioPath)`` pairs."""
        return sc.parallelize(self._extract_all_text_and_audio_path())

    def process_data(self, rdd: "RDD"):
        """Apply ``_process_source_and_target`` to every value of ``rdd``."""
        return rdd.mapValues(self._process_source_and_target)

    def _extract_text_and_path(self, line, index):
        """Parse one metadata line into a ``TextAndAudioPath``.

        Args:
            line: Raw line of ``metadata.csv``; field 0 is used as the key
                and field 2 as the text.
            index: Identifier stored in the resulting record.

        Returns:
            The parsed record, or ``None`` when the line has fewer than three
            ``|``-separated fields (for example a blank line), so the caller
            can skip it instead of failing with ``IndexError``.
        """
        parts = line.strip().split('|')
        if len(parts) < 3:
            return None
        key, text = parts[0], parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndAudioPath(index, key, wav_path, text)

    def _extract_all_text_and_audio_path(self):
        """Yield ``(index, record)`` for every usable line of the metadata.

        Indices start at 1 and only advance for lines that produced a record,
        so the emitted indices are contiguous.
        """
        metadata_path = os.path.join(self.in_dir, 'metadata.csv')
        index = 1
        with open(metadata_path, mode='r', encoding='utf-8') as f:
            for line in f:
                extracted = self._extract_text_and_path(line, index)
                if extracted is None:
                    continue
                yield (index, extracted)
                index += 1

    def _process_source_and_target(self, paths: "TextAndAudioPath"):
        """Write the tfrecord for one utterance and return its metadata.

        Loads the waveform, computes its mel spectrogram as float32 and
        transposes it so that axis 0 is the axis counted as frames below.

        Returns:
            ``SourceAndTargetMetaData`` holding the id, key, number of
            waveform samples, number of frames and the written file path.
        """
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        n_samples = len(wav)
        n_frames = mel_spectrogram.shape[0]
        filepath = os.path.join(self.out_dir, f"{paths.key}.tfrecord")
        tfrecord.write_preprocessed_data(paths.id, paths.key, wav,
                                         mel_spectrogram, paths.text, filepath)
        return SourceAndTargetMetaData(paths.id, paths.key, n_samples,
                                       n_frames, filepath)

    def _process_mel(self, paths: "TextAndAudioPath"):
        """Return the frame count and per-frame mel sums of one utterance.

        Nothing is written to disk; the spectrogram is transposed and then
        summed along axis 1, giving one value per row (frame).
        """
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        n_frames = mel_spectrogram.shape[0]
        sum_mel_powers = np.sum(mel_spectrogram, axis=1)
        return MelMetaData(n_frames, sum_mel_powers)
class LJSpeech:
    """Preprocessing driver that dumps mel features and waveforms to files.

    Reads ``metadata.csv`` (``|``-separated fields) from ``in_dir`` and, for
    every entry, writes ``<key>.mfbsp`` into ``mel_out_dir`` and ``<key>.wav``
    into ``wav_out_dir``.
    """

    def __init__(self, in_dir, mel_out_dir, wav_out_dir, hparams):
        """Remember the directories and build the audio helper.

        Args:
            in_dir: Directory containing ``metadata.csv`` and ``wavs/``.
            mel_out_dir: Directory that receives the mel feature files.
            wav_out_dir: Directory that receives the re-saved waveforms.
            hparams: Hyper-parameters handed to ``Audio``.
        """
        self.in_dir = in_dir
        self.mel_out_dir = mel_out_dir
        self.wav_out_dir = wav_out_dir
        self.audio = Audio(hparams)

    @property
    def record_ids(self):
        """Iterator over the record ids ``"1"`` .. ``"13100"`` as strings."""
        return map(str, range(1, 13101))

    def record_file_path(self, record_id, kind):
        """Return the tfrecord path for ``record_id`` under ``mel_out_dir``.

        Args:
            record_id: Value convertible with ``int()``; zero-padded to five
                digits in the file name.
            kind: Either ``"source"`` or ``"target"``.
        """
        assert kind in ["source", "target"]
        return os.path.join(self.mel_out_dir,
                            f"ljspeech-{kind}-{int(record_id):05d}.tfrecord")

    def text_and_path_rdd(self, sc: "SparkContext"):
        """Return an RDD of ``(index, TextAndPath)`` pairs."""
        return sc.parallelize(self._extract_all_text_and_path())

    def process_wav(self, rdd: "RDD"):
        """Apply ``_process_wav`` to every value of ``rdd``."""
        return rdd.mapValues(self._process_wav)

    def _extract_text_and_path(self, line, index):
        """Parse one metadata line into a ``TextAndPath``.

        Field 0 is used as the key and field 2 as the text; the fourth
        constructor argument is always ``None`` here.

        Returns:
            The parsed record, or ``None`` when the line has fewer than three
            ``|``-separated fields (for example a blank line), so the caller
            can skip it instead of failing with ``IndexError``.
        """
        parts = line.strip().split('|')
        if len(parts) < 3:
            return None
        key, text = parts[0], parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndPath(index, key, wav_path, None, text)

    def _extract_all_text_and_path(self):
        """Yield ``(index, record)`` for every usable metadata line.

        ``index`` is the zero-based line number in ``metadata.csv``; skipped
        lines therefore leave gaps.
        """
        metadata_path = os.path.join(self.in_dir, 'metadata.csv')
        with open(metadata_path, mode='r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)

    def _process_wav(self, paths: "TextAndPath"):
        """Write the normalized mel features and the waveform of one entry.

        Returns ``None``; the results are the two files written to
        ``mel_out_dir`` and ``wav_out_dir``.
        """
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        mel_spectrogram = self.audio.normalize_mel(mel_spectrogram)

        mel_filepath = os.path.join(self.mel_out_dir, f"{paths.key}.mfbsp")
        wav_filepath = os.path.join(self.wav_out_dir, f"{paths.key}.wav")

        # NOTE(review): for numpy arrays ``format`` only applies to text-mode
        # output (non-empty ``sep``); this call writes raw bytes in the
        # array's own dtype. Confirm normalize_mel() returns an ndarray that
        # is still float32 if a "<f4" file layout is required.
        mel_spectrogram.tofile(mel_filepath, format="<f4")
        self.audio.save_wav(wav, wav_filepath)
class VCTK:
    """Preprocessing driver for a corpus laid out like VCTK.

    Expects ``in_dir`` to contain a speaker-info file plus ``txt/p<id>/`` and
    ``wav48/p<id>/`` directories, and writes ``<key>.source.tfrecord`` and
    ``<key>.target.tfrecord`` files into ``out_dir``.
    """

    def __init__(self,
                 in_dir,
                 out_dir,
                 hparams,
                 speaker_info_filename='speaker-info.txt'):
        """Remember the directories and build the audio helper.

        Args:
            in_dir: Corpus root directory.
            out_dir: Directory that receives the generated tfrecord files.
            hparams: Hyper-parameters handed to ``Audio``.
            speaker_info_filename: Name of the speaker table inside
                ``in_dir``.
        """
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.speaker_info_filename = speaker_info_filename
        self.audio = Audio(hparams)

    def list_files(self):
        """Return one ``TxtWavRecord`` per transcript/waveform pair.

        For every speaker the sorted ``.txt`` and ``.wav`` files are paired
        positionally; the pair's file stems must match. Records are numbered
        consecutively from 0 across all speakers.

        Raises:
            AssertionError: If a paired transcript and waveform have
                different file stems.
        """
        def files_with_suffix(directory, suffix):
            # Sorted so transcripts and waveforms line up positionally.
            return [
                os.path.join(directory, name)
                for name in sorted(os.listdir(directory))
                if name.endswith(suffix)
            ]

        def stem(path):
            # splitext removes only the extension; str.strip('.wav') would
            # strip any leading/trailing '.', 'w', 'a' or 'v' characters.
            return os.path.splitext(os.path.basename(path))[0]

        records = []
        for speaker_info in self._load_speaker_info():
            txt_dir = os.path.join(self.in_dir, f"txt/p{speaker_info.id}")
            wav_dir = os.path.join(self.in_dir, f"wav48/p{speaker_info.id}")
            txt_paths = files_with_suffix(txt_dir, '.txt')
            wav_paths = files_with_suffix(wav_dir, '.wav')
            for txt_path, wav_path in zip(txt_paths, wav_paths):
                key = stem(wav_path)
                assert key == stem(txt_path)
                records.append(
                    TxtWavRecord(len(records), key, txt_path, wav_path,
                                 speaker_info))
        return records

    def process_sources(self, rdd: "RDD"):
        """Apply ``_process_txt`` to every element of ``rdd``."""
        return rdd.map(self._process_txt)

    def process_targets(self, rdd: "RDD"):
        """Apply ``_process_wav`` to every element and wrap the result.

        The mapped RDD is persisted with ``MEMORY_AND_DISK`` before being
        wrapped in ``TargetRDD``.
        """
        return TargetRDD(
            rdd.map(self._process_wav).persist(StorageLevel.MEMORY_AND_DISK))

    def _load_speaker_info(self):
        """Yield a ``SpeakerInfo`` for every listed speaker except 315.

        The first line of the file is treated as a header and skipped, blank
        lines are ignored. Columns are whitespace separated: id, age, gender.
        Gender is encoded as 0 for ``'F'`` and 1 for anything else.
        """
        info_path = os.path.join(self.in_dir, self.speaker_info_filename)
        with open(info_path, mode='r', encoding='utf8') as f:
            next(f, None)  # header line
            for line in f:
                fields = line.split()
                if not fields:
                    continue
                speaker_id = fields[0]
                # FixMe: Why 315 is missing?
                # NOTE(review): presumably the corpus lacks usable data for
                # this speaker -- TODO confirm.
                if speaker_id == "315":
                    continue
                gender = 0 if fields[2] == 'F' else 1
                yield SpeakerInfo(int(speaker_id), int(fields[1]), gender)

    def _process_wav(self, record: "TxtWavRecord"):
        """Write the target tfrecord of one utterance and return statistics.

        The waveform is loaded and trimmed, its mel spectrogram is computed
        as float32 and transposed, and all statistics are reduced over
        axis 0.

        Returns:
            ``MelStatistics`` with per-column min, max, sum and sum of
            squares, plus the length of axis 0.
        """
        wav = self.audio.trim(self.audio.load_wav(record.wav_path))
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        file_path = os.path.join(self.out_dir, f"{record.key}.target.tfrecord")
        write_preprocessed_target_data(record.id, record.key, mel_spectrogram,
                                       file_path)
        return MelStatistics(id=record.id,
                             key=record.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram),
                                            axis=0))

    def _process_txt(self, record: "TxtWavRecord"):
        """Write the source tfrecord of one utterance and return its key.

        Only the first line of the transcript file is used. It is converted
        with ``text_to_sequence`` and ``basic_cleaners`` and stored together
        with the speaker id, age and gender.
        """
        # list_files() already stores txt_path with in_dir prepended, so the
        # path is opened as-is (mirroring wav_path in _process_wav); joining
        # in_dir again duplicated the prefix whenever in_dir was relative.
        with open(record.txt_path, mode='r', encoding='utf8') as f:
            txt = f.readline().rstrip("\n")
        sequence, clean_text = text_to_sequence(txt, basic_cleaners)
        source = np.array(sequence, dtype=np.int64)
        file_path = os.path.join(self.out_dir,
                                 f"{record.key}.source.tfrecord")
        write_preprocessed_source_data(record.id, record.key, source,
                                       clean_text, record.speaker_info.id,
                                       record.speaker_info.age,
                                       record.speaker_info.gender,
                                       file_path)
        return record.key
Beispiel #4
0
class LJSpeech:
    """Preprocessing driver producing source and target tfrecords.

    Reads ``metadata.csv`` (``|``-separated fields) from ``in_dir`` and writes
    ``<key>.source.tfrecord`` (encoded text) and ``<key>.target.tfrecord``
    (mel spectrogram) files into ``out_dir``.
    """

    def __init__(self, in_dir, out_dir, hparams):
        """Remember the directories and build the audio helper.

        Args:
            in_dir: Directory containing ``metadata.csv`` and ``wavs/``.
            out_dir: Directory that receives the generated tfrecord files.
            hparams: Hyper-parameters handed to ``Audio``.
        """
        self.in_dir = in_dir
        self.out_dir = out_dir
        self.audio = Audio(hparams)

    @property
    def record_ids(self):
        """Iterator over the record ids ``"1"`` .. ``"13100"`` as strings."""
        return map(str, range(1, 13101))

    def record_file_path(self, record_id, kind):
        """Return the tfrecord path for ``record_id`` under ``out_dir``.

        Args:
            record_id: Value convertible with ``int()``; zero-padded to five
                digits in the file name.
            kind: Either ``"source"`` or ``"target"``.
        """
        assert kind in ["source", "target"]
        return os.path.join(self.out_dir,
                            f"ljspeech-{kind}-{int(record_id):05d}.tfrecord")

    def text_and_path_rdd(self, sc: "SparkContext"):
        """Return an RDD of ``(index, TextAndPath)`` pairs."""
        return sc.parallelize(self._extract_all_text_and_path())

    def process_targets(self, rdd: "RDD"):
        """Apply ``_process_target`` to every value and wrap the result.

        The mapped RDD is persisted with ``MEMORY_AND_DISK`` before being
        wrapped in ``TargetRDD``.
        """
        return TargetRDD(
            rdd.mapValues(self._process_target).persist(
                StorageLevel.MEMORY_AND_DISK))

    def process_sources(self, rdd: "RDD"):
        """Apply ``_process_source`` to every value of ``rdd``."""
        return rdd.mapValues(self._process_source)

    def _extract_text_and_path(self, line, index):
        """Parse one metadata line into a ``TextAndPath``.

        Field 0 is used as the key and field 2 as the text; the fourth
        constructor argument is always ``None`` here.

        Returns:
            The parsed record, or ``None`` when the line has fewer than three
            ``|``-separated fields (for example a blank line), so the caller
            can skip it instead of failing with ``IndexError``.
        """
        parts = line.strip().split('|')
        if len(parts) < 3:
            return None
        key, text = parts[0], parts[2]
        wav_path = os.path.join(self.in_dir, 'wavs', '%s.wav' % key)
        return TextAndPath(index, key, wav_path, None, text)

    def _extract_all_text_and_path(self):
        """Yield ``(index, record)`` for every usable metadata line.

        ``index`` is the zero-based line number in ``metadata.csv``; skipped
        lines therefore leave gaps.
        """
        metadata_path = os.path.join(self.in_dir, 'metadata.csv')
        with open(metadata_path, mode='r', encoding='utf-8') as f:
            for index, line in enumerate(f):
                extracted = self._extract_text_and_path(line, index)
                if extracted is not None:
                    yield (index, extracted)

    def _text_to_sequence(self, text):
        """Encode ``text`` with ``english_cleaners``.

        Returns:
            A tuple of the symbol sequence as an ``int64`` array and the
            cleaned text returned by ``text_to_sequence``.
        """
        sequence, clean_text = text_to_sequence(text, english_cleaners)
        return np.array(sequence, dtype=np.int64), clean_text

    def _process_target(self, paths: "TextAndPath"):
        """Write the target tfrecord of one utterance and return statistics.

        The mel spectrogram is computed as float32 and transposed; all
        statistics are reduced over axis 0.

        Returns:
            ``MelStatistics`` with per-column min, max, sum and sum of
            squares, plus the length of axis 0.
        """
        wav = self.audio.load_wav(paths.wav_path)
        mel_spectrogram = self.audio.melspectrogram(wav).astype(np.float32).T
        filepath = os.path.join(self.out_dir, f"{paths.key}.target.tfrecord")
        write_preprocessed_target_data(paths.id, paths.key, mel_spectrogram,
                                       filepath)
        return MelStatistics(id=paths.id,
                             key=paths.key,
                             min=np.min(mel_spectrogram, axis=0),
                             max=np.max(mel_spectrogram, axis=0),
                             sum=np.sum(mel_spectrogram, axis=0),
                             length=len(mel_spectrogram),
                             moment2=np.sum(np.square(mel_spectrogram),
                                            axis=0))

    def _process_source(self, paths: "TextAndPath"):
        """Write the source tfrecord of one utterance and return its key."""
        sequence, clean_text = self._text_to_sequence(paths.text)
        filepath = os.path.join(self.out_dir, f"{paths.key}.source.tfrecord")
        write_preprocessed_source_data(paths.id, paths.key, sequence,
                                       clean_text, filepath)
        return paths.key
class Synthesizer():
    """Run a trained model on text entries and export the results.

    For every processed entry an attention plot and a spectrogram/stop-token
    plot are written below ``<out_dir>/png``. Depending on the flags, a
    Griffin-Lim waveform is written below ``<out_dir>/wav`` and a mel feature
    file below ``<out_dir>/mel``.
    """

    def __init__(self, model_path, out_dir, text_file, sil_file,
                 use_griffin_lim, gen_wavenet_fea, hparams):
        """Load the model and prepare the output directories.

        Args:
            model_path: Passed to ``get_model`` together with ``hparams``.
            out_dir: Root directory for all generated files.
            text_file: Metadata/text file describing what to synthesize.
            sil_file: Audio file read as silence in ``gen_wavenet_feature``.
            use_griffin_lim: When true, waveforms are generated and saved.
            gen_wavenet_fea: When true, mel feature files are generated.
            hparams: Hyper-parameter object read throughout the class.
        """
        self.out_dir = out_dir
        self.text_file = text_file
        self.sil_file = sil_file
        self.use_griffin_lim = use_griffin_lim
        self.gen_wavenet_fea = gen_wavenet_fea
        self.hparams = hparams

        self.model = get_model(model_path, hparams)
        self.audio_class = Audio(hparams)

        # The text front end is imported lazily so that only the selected
        # one has to be importable.
        if hparams.use_phone:
            from text.phones import Phones
            phone_class = Phones(hparams.phone_set_file)
            self.text_to_sequence = phone_class.text_to_sequence
        else:
            from text import text_to_sequence
            self.text_to_sequence = text_to_sequence

        if hparams.is_multi_speakers and not hparams.use_pretrained_spkemb:
            self.speaker_id_dict = gen_speaker_id_dict(hparams)

        self.out_png_dir = os.path.join(self.out_dir, 'png')
        os.makedirs(self.out_png_dir, exist_ok=True)
        if self.use_griffin_lim:
            self.out_wav_dir = os.path.join(self.out_dir, 'wav')
            os.makedirs(self.out_wav_dir, exist_ok=True)
        if self.gen_wavenet_fea:
            self.out_mel_dir = os.path.join(self.out_dir, 'mel')
            os.makedirs(self.out_mel_dir, exist_ok=True)

    def get_mel_gt(self, wavname):
        """Return the reference mel spectrogram for ``wavname``.

        With ``hparams.load_mel`` unset, the waveform is read (from the HDF5
        file or from ``<wav_dir>/<wavname>.wav``), scaled by
        ``max_wav_value``, pre-emphasized and converted to a mel spectrogram.
        Otherwise a precomputed spectrogram is loaded from the zip archive,
        the HDF5 file or ``<wav_dir>/<wavname>.npy``.

        Returns:
            The spectrogram as a torch tensor with a new leading axis of
            size 1.
        """
        hparams = self.hparams
        if not hparams.load_mel:
            if hparams.use_hdf5:
                with h5py.File(hparams.hdf5_file, 'r') as h5:
                    # Bound to ``audio`` because that is the name normalized
                    # below; the previous ``data`` left ``audio`` undefined
                    # on this branch.
                    audio = h5[wavname][:]
            else:
                filename = os.path.join(hparams.wav_dir, wavname + '.wav')
                sr_t, audio = wavread(filename)
                assert sr_t == hparams.sample_rate
            audio_norm = audio / hparams.max_wav_value
            wav = self.audio_class._preemphasize(audio_norm)
            melspec = self.audio_class.melspectrogram(wav, clip_norm=True)
            melspec = torch.FloatTensor(melspec.astype(np.float32))
        else:
            if hparams.use_zip:
                with zipfile.ZipFile(hparams.zip_path, 'r') as f:
                    data = f.read(wavname)
                    melspec = np.load(io.BytesIO(data))
                melspec = torch.FloatTensor(melspec.astype(np.float32))
            elif hparams.use_hdf5:
                with h5py.File(hparams.hdf5_file, 'r') as h5:
                    melspec = h5[wavname][:]
                melspec = torch.FloatTensor(melspec.astype(np.float32))
            else:
                filename = os.path.join(hparams.wav_dir, wavname + '.npy')
                melspec = torch.from_numpy(np.load(filename))
        melspec = torch.unsqueeze(melspec, 0)
        return melspec

    def get_inputs(self, meta_data):
        """Build the model inputs for one metadata entry.

        ``meta_data[0]`` is split on ``|``: field 0 is the file name, field 1
        is printed (and used as style id for multi-style models) and the last
        field is the text to encode.

        Returns:
            ``(inputs, filename)`` where ``inputs`` is either the encoded
            sequence alone or a 2-tuple of the sequence and one conditioning
            value (speaker embedding, speaker id, style id or reference mel).
        """
        hparams = self.hparams
        # Prepare text input
        fields = meta_data[0].strip().split('|')
        filename = fields[0]
        print(fields[-1])
        print(fields[1])
        sequence = np.array(
            self.text_to_sequence(fields[-1],
                                  ['english_cleaners']))  # [None, :]
        print(sequence)
        sequence = torch.autograd.Variable(
            torch.from_numpy(sequence)).to(device).long()

        # NOTE(review): the 'r' lookups below index meta_data by string
        # while the text above is read from meta_data[0] -- confirm the
        # metadata container supports both.
        if hparams.is_multi_speakers:
            if hparams.use_pretrained_spkemb:
                ref_file = meta_data['r']
                spk_embedding = np.array(np.load(ref_file))
                spk_embedding = torch.autograd.Variable(
                    torch.from_numpy(spk_embedding)).to(device).float()
                inputs = (sequence, spk_embedding)
            else:
                # The speaker name is the part of the file name before the
                # first underscore.
                speaker_name = filename.split('_')[0]
                speaker_id = self.speaker_id_dict[speaker_name]
                speaker_id = np.array([speaker_id])
                speaker_id = torch.autograd.Variable(
                    torch.from_numpy(speaker_id)).to(device).long()
                inputs = (sequence, speaker_id)

        if hparams.is_multi_styles:
            style_id = np.array([int(fields[1])])
            style_id = torch.autograd.Variable(
                torch.from_numpy(style_id)).to(device).long()
            inputs = (sequence, style_id)
        elif hparams.use_vqvae:
            ref_file = meta_data['r']
            spk_ref = self.get_mel_gt(ref_file)
            inputs = (sequence, spk_ref)
        elif not hparams.is_multi_speakers:
            # Fall back to the bare sequence only when no conditioning input
            # was built above; an unconditional ``else`` here discarded the
            # multi-speaker tuple that gen_mel() unpacks.
            inputs = sequence

        return inputs, filename

    def gen_mel(self, meta_data):
        """Run the model on one entry and save its diagnostic plots.

        Writes ``<filename>_att.png`` and ``<filename>_spec_stop.png`` into
        the png directory.

        Returns:
            ``(mel_outputs, filename)`` with ``mel_outputs`` as a numpy array
            transposed to (dim, length).
        """
        hparams = self.hparams
        inputs, filename = self.get_inputs(meta_data)
        speaker_id = None
        style_id = None
        spk_embedding = None
        spk_ref = None
        if hparams.is_multi_speakers:
            if hparams.use_pretrained_spkemb:
                sequence, spk_embedding = inputs
            else:
                sequence, speaker_id = inputs
        elif hparams.use_vqvae:
            sequence, spk_ref = inputs
        else:
            sequence = inputs

        # NOTE(review): with both is_multi_speakers and is_multi_styles set,
        # get_inputs() returns (sequence, style_id), so the speaker value
        # unpacked above is actually the style id -- confirm this
        # combination is never used.
        if hparams.is_multi_styles:
            sequence, style_id = inputs

        # Decode text input and plot results
        with torch.no_grad():
            mel_outputs, gate_outputs, att_ws = self.model.inference(
                sequence,
                hparams,
                spk_id=speaker_id,
                style_id=style_id,
                spemb=spk_embedding,
                spk_ref=spk_ref)

            duration_list = DurationCalculator._calculate_duration(att_ws)
            print('att_ws.shape=', att_ws.shape)
            print('duration=', duration_list)
            print('duration_sum=', torch.sum(duration_list))
            print('focus_rete=',
                  DurationCalculator._calculate_focus_rete(att_ws))

            mel_outputs = mel_outputs.transpose(
                0, 1).float().data.cpu().numpy()  # (dim, length)
            mel_outputs_with_duration = get_duration_matrix(
                char_text_dir=self.text_file,
                duration_tensor=duration_list,
                save_mode='phone').transpose(0, 1).float().data.cpu().numpy()
            gate_outputs = gate_outputs.float().data.cpu().numpy()
            att_ws = att_ws.float().data.cpu().numpy()

        image_path = os.path.join(self.out_png_dir,
                                  "{}_att.png".format(filename))
        _plot_and_save(att_ws, image_path)

        image_path = os.path.join(self.out_png_dir,
                                  "{}_spec_stop.png".format(filename))
        _, axes = plt.subplots(3, 1, figsize=(8, 8))
        # 'lower' is the documented spelling for a bottom-left origin;
        # current matplotlib rejects 'bottom'.
        axes[0].imshow(mel_outputs,
                       aspect='auto',
                       origin='lower',
                       interpolation='none')
        axes[1].imshow(mel_outputs_with_duration,
                       aspect='auto',
                       origin='lower',
                       interpolation='none')
        axes[2].scatter(range(len(gate_outputs)),
                        gate_outputs,
                        alpha=0.5,
                        color='red',
                        marker='.',
                        s=5,
                        label='predicted')
        plt.savefig(image_path, format='png')
        plt.close()

        return mel_outputs, filename

    def gen_wav_griffin_lim(self, mel_outputs, filename):
        """Invert ``mel_outputs`` to audio and save ``<filename>-gl.wav``."""
        grf_wav = self.audio_class.inv_mel_spectrogram(mel_outputs)
        grf_wav = self.audio_class.inv_preemphasize(grf_wav)
        wav_path = os.path.join(self.out_wav_dir, "{}-gl.wav".format(filename))
        self.audio_class.save_wav(grf_wav, wav_path)

    def gen_wavenet_feature(self, mel_outputs, filename, add_end_sil=True):
        """Save ``mel_outputs`` padded with silence as ``<filename>-wn.mel``.

        Args:
            mel_outputs: Mel spectrogram as returned by ``gen_mel``.
            filename: Stem of the output file inside the mel directory.
            add_end_sil: Also append the silence frames after the
                utterance, not only before it.
        """
        # denormalize
        mel = self.audio_class._denormalize(mel_outputs)
        # normalize to 0-1
        mel = np.clip(((mel - self.audio_class.hparams.min_level_db) /
                       (-self.audio_class.hparams.min_level_db)), 0, 1)

        mel = mel.T.astype(np.float32)

        # NOTE(review): hard-coded silence length, sampling rate and frame
        # size -- presumably these must match the audio hparams; TODO
        # confirm.
        frame_size = 200
        SILSEG = 0.3
        SAMPLING = 16000
        sil_samples = int(SILSEG * SAMPLING)
        sil_frames = int(sil_samples / frame_size)
        sil_data, _ = soundfile.read(self.sil_file)
        sil_data = sil_data[:sil_samples]

        sil_mel_spec, _ = self.audio_class._magnitude_spectrogram(
            sil_data, clip_norm=True)
        # NOTE(review): looks like a rescale from [-4, 4] to [0, 1] (the
        # inverse of the ``* 8.0 - 4.0`` in inference_f) -- confirm the
        # value range of _magnitude_spectrogram.
        sil_mel_spec = (sil_mel_spec + 4.0) / 8.0

        pad_mel_data = np.concatenate((sil_mel_spec[:sil_frames], mel), axis=0)
        if add_end_sil:
            pad_mel_data = np.concatenate(
                (pad_mel_data, sil_mel_spec[:sil_frames]), axis=0)
        out_mel_file = os.path.join(self.out_mel_dir,
                                    '{}-wn.mel'.format(filename))
        save_htk_data(pad_mel_data, out_mel_file)

    def inference_f(self, meta_data=None):
        """Synthesize one entry and export it according to the flags.

        Args:
            meta_data: Entry to synthesize. When omitted, the metadata is
                read from ``self.text_file`` with ``_read_meta_yyh``. The
                parameter lets ``inference`` hand over one entry per call;
                without it that call raised ``TypeError``.

        Returns:
            The file name stem of the processed entry.
        """
        if meta_data is None:
            meta_data = _read_meta_yyh(self.text_file)
        mel_outputs, filename = self.gen_mel(meta_data)
        print('my_mel_outputs=', mel_outputs)
        print('my_mel_outputs_max=', np.max(mel_outputs))
        print('my_mel_outputs_min=', np.min(mel_outputs))
        # NOTE(review): the predicted mel is replaced here by an array loaded
        # from a fixed relative path, so the exports below never use the
        # model output. This looks like leftover comparison code -- confirm
        # before relying on the generated files.
        mel_outputs = np.load(r'../out_0.npy').transpose(1, 0)
        mel_outputs = mel_outputs * 8.0 - 4.0
        print('his_mel_outputs=', mel_outputs)
        print('his_mel_outputs_max=', np.max(mel_outputs))
        print('his_mel_outputs_min=', np.min(mel_outputs))
        if self.use_griffin_lim:
            self.gen_wav_griffin_lim(mel_outputs, filename)
        if self.gen_wavenet_fea:
            self.gen_wavenet_feature(mel_outputs, filename)
        return filename

    def inference(self):
        """Run ``inference_f`` on every entry read from ``self.text_file``."""
        # NOTE(review): entries come from _read_meta() while get_inputs()
        # reads meta_data[0] as a '|'-separated line -- confirm the entry
        # format matches.
        all_meta_data = _read_meta(self.text_file, self.hparams.meta_format)
        list(map(self.inference_f, all_meta_data))