Exemple #1
0
    def preprocess(self, num_workers=4, mode=SOURCE_AND_TARGET):
        """Preprocess JSUT data from ``self.in_dir`` into ``self.out_dir``.

        Depending on ``mode``, wav files and/or transcriptions are collected
        from every subset in ``jsut.available_subsets``, processed in worker
        processes, and the per-item results are passed to
        ``self._write_target_metadata`` / ``self._write_source_metadata``.
        Targets are fully processed before sources are submitted.

        Args:
            num_workers: Maximum number of worker processes in the pool.
            mode: ``TARGET_ONLY``, ``SOURCE_ONLY`` or ``SOURCE_AND_TARGET``.
                Any other value creates the output directory and does
                nothing else.

        Raises:
            Whatever a worker raised, re-raised by ``future.result()``.
        """
        # exist_ok=True replaces the racy "check, then create" sequence.
        os.makedirs(self.out_dir, exist_ok=True)

        # The context manager guarantees the pool is shut down even when a
        # worker or a metadata writer raises; previously the worker
        # processes were leaked on any error path.
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            if mode in (TARGET_ONLY, SOURCE_AND_TARGET):
                wav_paths = jsut.WavFileDataSource(
                    self.in_dir,
                    subsets=jsut.available_subsets).collect_files()
                # Items are numbered starting from 1.
                futures = [
                    executor.submit(
                        _process_audio, self.out_dir, index, wav_path)
                    for index, wav_path in enumerate(wav_paths, 1)
                ]
                # Results are gathered in submission order, not completion
                # order, so the metadata keeps the dataset ordering.
                result = [
                    future.result()
                    for future in tqdm(futures, desc="targets")
                ]
                self._write_target_metadata(result)

            if mode in (SOURCE_ONLY, SOURCE_AND_TARGET):
                transcriptions = jsut.TranscriptionDataSource(
                    self.in_dir,
                    subsets=jsut.available_subsets).collect_files()
                futures = [
                    executor.submit(_process_text, self.out_dir, index, text)
                    for index, text in enumerate(transcriptions, 1)
                ]
                result = [
                    future.result()
                    for future in tqdm(futures, desc="sources")
                ]
                self._write_source_metadata(result)
Exemple #2
0
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    """Process every JSUT utterance found under ``in_dir`` in parallel.

    Transcriptions and wav paths are collected from all subsets in
    ``jsut.available_subsets``, paired positionally, and each pair is handed
    to ``_process_utterance`` in a worker process.

    Args:
        in_dir: Root directory passed to the jsut data sources.
        out_dir: Output directory forwarded to ``_process_utterance``.
        num_workers: Maximum number of worker processes in the pool.
        tqdm: Callable wrapped around the list of futures before iterating
            (identity by default); lets callers plug in a progress bar.

    Returns:
        A list with the return value of ``_process_utterance`` for each
        utterance, in submission order.

    Raises:
        Whatever a worker raised, re-raised by ``future.result()``.
    """
    transcriptions = jsut.TranscriptionDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()
    wav_paths = jsut.WavFileDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()

    # NOTE(review): zip() stops at the shorter list, so a mismatch between
    # the number of transcriptions and wav files is silently truncated.
    pairs = zip(transcriptions, wav_paths)

    # The context manager shuts the pool down once all results are gathered
    # (or an error occurs); previously the executor was never shut down and
    # its worker processes were left behind.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Utterances are numbered starting from 1.
        futures = [
            executor.submit(_process_utterance, out_dir, index, wav_path, text)
            for index, (text, wav_path) in enumerate(pairs, 1)
        ]
        return [future.result() for future in tqdm(futures)]
def test_ja_jsut():
    """Round-trip every JSUT transcription through the ``jp`` frontend.

    For each ``p`` in (0.0, 0.5, 1.0), every transcription is encoded with
    ``text_to_sequence`` and asserted to end with ``eos``, then decoded with
    ``sequence_to_text``. The first ten round trips of each pass are printed
    for visual inspection.
    """
    jp_frontend = frontend.jp
    from nnmnkwii.datasets import jsut
    from tqdm import trange
    # Not referenced below. NOTE(review): presumably imported so the test
    # fails early when jaconv is not installed -- confirm.
    import jaconv

    source = jsut.TranscriptionDataSource(
        "/home/ryuichi/data/jsut_ver1.1/", subsets=jsut.available_subsets)
    transcripts = source.collect_files()

    preview_count = 10
    template = "{0}: {1}\n{0}: {2}\n"

    for p in (0.0, 0.5, 1.0):
        for idx in trange(len(transcripts)):
            original = transcripts[idx]
            encoded = jp_frontend.text_to_sequence(original, p=p)
            assert encoded[-1] == eos
            # Decoding runs for every item, even those that are not printed.
            decoded = jp_frontend.sequence_to_text(encoded)

            if idx >= preview_count:
                continue
            print(template.format(idx, original, decoded))