def preprocess(self, num_workers=4, mode=SOURCE_AND_TARGET):
    """Preprocess the JSUT corpus in parallel and write metadata.

    Creates ``self.out_dir`` if needed, then fans the per-item work out to
    a process pool.

    Args:
        num_workers: Maximum number of worker processes in the pool.
        mode: Selects which passes run. ``TARGET_ONLY`` and
            ``SOURCE_AND_TARGET`` run the wav pass (``_process_audio``);
            ``SOURCE_ONLY`` and ``SOURCE_AND_TARGET`` run the transcription
            pass (``_process_text``). Any other value runs neither pass.

    Raises:
        Whatever a worker raised is re-raised here by ``future.result()``.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(self.out_dir, exist_ok=True)

    # The context manager guarantees the pool is shut down even when a
    # worker or a metadata writer raises.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        if mode in (TARGET_ONLY, SOURCE_AND_TARGET):
            wav_paths = jsut.WavFileDataSource(
                self.in_dir, subsets=jsut.available_subsets).collect_files()
            # Items are numbered from 1, in collection order.
            futures = [
                executor.submit(
                    partial(_process_audio, self.out_dir, index + 1, wav_path))
                for index, wav_path in enumerate(wav_paths)
            ]
            result = [
                future.result() for future in tqdm(futures, desc="targets")
            ]
            self._write_target_metadata(result)

        if mode in (SOURCE_ONLY, SOURCE_AND_TARGET):
            transcriptions = jsut.TranscriptionDataSource(
                self.in_dir, subsets=jsut.available_subsets).collect_files()
            futures = [
                executor.submit(
                    partial(_process_text, self.out_dir, index + 1, text))
                for index, text in enumerate(transcriptions)
            ]
            result = [
                future.result() for future in tqdm(futures, desc="sources")
            ]
            self._write_source_metadata(result)
def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
    """Process every JSUT (transcription, wav) pair in parallel.

    Args:
        in_dir: Directory passed to the ``jsut`` data sources.
        out_dir: Directory passed through to ``_process_utterance``.
        num_workers: Maximum number of worker processes in the pool.
        tqdm: Optional wrapper applied to the list of futures while results
            are collected (e.g. a progress bar); defaults to the identity.

    Returns:
        A list with the return value of ``_process_utterance`` for each
        pair, in submission order.

    Raises:
        Whatever a worker raised is re-raised here by ``future.result()``.
    """
    transcriptions = jsut.TranscriptionDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()
    wav_paths = jsut.WavFileDataSource(
        in_dir, subsets=jsut.available_subsets).collect_files()

    # The context manager shuts the pool down on both success and failure;
    # previously the executor was never shut down.
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        # Utterances are numbered from 1. NOTE: zip() stops at the shorter
        # list, so a length mismatch silently drops the trailing items.
        futures = [
            executor.submit(
                partial(_process_utterance, out_dir, index + 1, wav_path, text))
            for index, (text, wav_path)
            in enumerate(zip(transcriptions, wav_paths))
        ]
        return [future.result() for future in tqdm(futures)]