def parse_libri_folder(libri_folders):
    """Parse LibriSpeech-style folders into speaker and transcription tables.

    Arguments
    ---------
    libri_folders : list
        Folders to scan for .flac utterances and *trans.txt files.

    Returns
    -------
    tuple
        ``(speakers, words_dict)`` where ``speakers`` maps a speaker id to
        the list of its utterance paths and ``words_dict`` maps an
        utterance id to its transcription text.
    """
    # Gather every audio file and transcription file across all folders.
    utterances, txt_files = [], []
    for folder in libri_folders:
        utterances += get_all_files(folder, match_and=[".flac"])
        txt_files += get_all_files(folder, match_and=["trans.txt"])

    # Each transcription line is "<utt_id> <words...>"; build an id -> words
    # lookup table from all transcription files.
    words_dict = {}
    for trans_file in txt_files:
        with open(trans_file, "r") as fh:
            for line in fh:
                utt_id, _, words = line.partition(" ")
                words_dict[utt_id] = words.strip("\n")

    # Group utterances by speaker id: in LibriSpeech layout the speaker id
    # is the grand-parent directory of the .flac file.
    speakers = {}
    for utt in utterances:
        spk_id = Path(utt).parent.parent.stem
        speakers.setdefault(spk_id, []).append(utt)

    return speakers, words_dict
def prepare_mini_librispeech(
    data_folder, save_json_train, save_json_valid, save_json_test
):
    """
    Prepares the json files for the Mini Librispeech dataset.

    Downloads the dataset if its not found in the `data_folder`.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the Mini Librispeech dataset is stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.

    Example
    -------
    >>> data_folder = '/path/to/mini_librispeech'
    >>> prepare_mini_librispeech(data_folder, 'train.json', 'valid.json', 'test.json')
    """
    # Nothing to do when the manifests already exist from a previous run.
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # Expected locations of the three splits inside the dataset.
    libri_root = os.path.join(data_folder, "LibriSpeech")
    split_folders = {
        "train": os.path.join(libri_root, "train-clean-5"),
        "valid": os.path.join(libri_root, "dev-clean-2"),
        "test": os.path.join(libri_root, "test-clean"),
    }

    # Download only when one of the expected folders is missing.
    if not check_folders(
        split_folders["train"], split_folders["valid"], split_folders["test"]
    ):
        download_mini_librispeech(data_folder)

    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}"
    )

    # Transcriptions are collected once and shared across all splits.
    trans_list = get_all_files(data_folder, match_and=[".trans.txt"])
    trans_dict = get_transcription(trans_list)

    # Emit one json manifest per split.
    targets = {
        "train": save_json_train,
        "valid": save_json_valid,
        "test": save_json_test,
    }
    for split, json_path in targets.items():
        wav_list = get_all_files(split_folders[split], match_and=[".flac"])
        create_json(wav_list, trans_dict, json_path)
def resample_folder(input_folder, output_folder, fs, regex):
    """Resample all matching audio files to the ``fs`` sampling rate.

    The folder structure of ``input_folder`` is mirrored under
    ``output_folder``.

    Arguments
    ---------
    input_folder : str
        Root folder containing the original audio files.
    output_folder : str
        Root folder where resampled files will be written.
    fs : int
        Target sampling frequency in Hz.
    regex : str
        Pattern passed to ``get_all_files`` to select files.
    """
    files = get_all_files(input_folder, match_and=[regex])
    torchaudio.initialize_sox()
    for f in tqdm.tqdm(files):
        # we use sox because torchaudio.Resample uses too much RAM.
        resample = torchaudio.sox_effects.SoxEffectsChain()
        resample.append_effect_to_chain("rate", [fs])
        resample.set_input_file(f)
        # FIX: unpack into a separate name; the original reassigned `fs`
        # here, clobbering the target-rate parameter for later iterations.
        audio, out_fs = resample.sox_build_flow_effects()
        # Peak-normalize; scale back otherwise you get empty .wav file.
        audio = audio / torch.max(torch.abs(audio), dim=-1, keepdim=True)[0]
        # Compute the mirrored destination path once instead of twice.
        out_path = os.path.join(
            output_folder, Path(f).relative_to(Path(input_folder))
        )
        os.makedirs(Path(out_path).parent, exist_ok=True)
        torchaudio.save(out_path, audio, out_fs)
    torchaudio.shutdown_sox()
def prepare_mini_librispeech( data_folder, save_json_train, save_json_valid, save_json_test, split_ratio=[80, 10, 10], ): """ Prepares the json files for the Mini Librispeech dataset. Downloads the dataset if it is not found in the `data_folder`. Arguments --------- data_folder : str Path to the folder where the Mini Librispeech dataset is stored. save_json_train : str Path where the train data specification file will be saved. save_json_valid : str Path where the validation data specification file will be saved. save_json_test : str Path where the test data specification file will be saved. split_ratio: list List composed of three integers that sets split ratios for train, valid, and test sets, respecively. For instance split_ratio=[80, 10, 10] will assign 80% of the sentences to training, 10% for validation, and 10% for test. Example ------- >>> data_folder = '/path/to/mini_librispeech' >>> prepare_mini_librispeech(data_folder, 'train.json', 'valid.json', 'test.json') """ # Check if this phase is already done (if so, skip it) if skip(save_json_train, save_json_valid, save_json_test): logger.info("Preparation completed in previous run, skipping.") return # If the dataset doesn't exist yet, download it train_folder = os.path.join(data_folder, "LibriSpeech", "train-clean-5") if not check_folders(train_folder): download_mini_librispeech(data_folder) # List files and create manifest from list logger.info( f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}" ) extension = [".flac"] wav_list = get_all_files(train_folder, match_and=extension) # Random split the signal list into train, valid, and test sets. data_split = split_sets(wav_list, split_ratio) # Creating json files create_json(data_split["train"], save_json_train) create_json(data_split["valid"], save_json_valid) create_json(data_split["test"], save_json_test)
def get_transcription_files_by_dataset(dataset: str, transcription_folder: str) -> List[str]:
    """Return the transcription file paths belonging to a given data set.

    The split file ``splits/<dataset>`` lists data items; each entry is
    mapped to a ``<stem>.tdf`` transcription filename, and the matching
    files are collected from ``transcription_folder``.
    """
    entries = get_data_list(f"splits/{dataset}")
    # Drop the extension (and surrounding whitespace) from each entry.
    stems = [entry.split(".")[0].strip() for entry in entries]
    # Transcriptions use the .tdf extension.
    tdf_names = [f"{stem}.tdf" for stem in stems]
    return get_all_files(transcription_folder, match_or=tdf_names)
def prepare_commonlanguage(folder, csv_file, max_noise_len=None):
    """Prepare the CommonLanguage dataset for VAD training.

    Arguments
    ---------
    folder : str
        The location of the folder containing the dataset.
    csv_file : str
        Filename for storing the prepared csv file.
    max_noise_len : float
        The maximum noise length in seconds. Noises longer
        than this will be cut into pieces.
    """
    logger.info("CommonLanguage Preparation...")
    wav_lst = get_all_files(os.path.join(folder), match_and=[".wav"])
    # The csv is only generated once; an existing file is left untouched.
    if os.path.isfile(csv_file):
        return
    logger.info(csv_file + " creation...")
    _prepare_csv(folder, wav_lst, csv_file, max_noise_len)
def create_sets(data_folder, extension):
    """Create train, dev and test file lists from the CommonLanguage folder.

    Arguments
    ---------
    data_folder : str
        Path of the CommonLanguage dataset.
    extension: list of file extentions
        List of strings with file extentions that correspond to the audio
        files in the CommonLanguage dataset

    Returns
    -------
    dictionary containing train, dev, and test splits.
    """
    # One (initially empty) file list per split.
    split_names = {"train", "dev", "test"}
    data_split = {name: [] for name in split_names}

    # A language folder qualifies only if it contains all three split
    # sub-folders.
    languages = []
    for entry in os.listdir(data_folder):
        lang_dir = os.path.join(data_folder, entry)
        if os.path.isdir(lang_dir) and split_names.issubset(
            os.listdir(lang_dir)
        ):
            languages.append(entry)

    logger.info(f"{len(languages)} languages detected!")

    # Pool the audio of every language into the shared split lists.
    for language in languages:
        for name in split_names:
            split_dir = os.path.join(data_folder, language, name)
            data_split[name].extend(
                get_all_files(split_dir, match_and=extension)
            )

    logger.info("Data successfully split!")
    return data_split
def prepare_musan(folder, music_csv, noise_csv, speech_csv, max_noise_len=None):
    """Prepare the musan dataset (music, noise, speech).

    Arguments
    ---------
    folder : str
        The location of the folder containing the dataset.
    music_csv : str
        Filename for storing the prepared music csv.
    noise_csv : str
        Filename for storing the prepared noise csv.
    speech_csv : str
        Filename for storing the prepared speech csv.
    max_noise_len : float
        The maximum noise length in seconds. Noises longer
        than this will be cut into pieces.
    """
    # The musan layout keeps one sub-folder per category; each category
    # gets its own csv file (same order as sub_folders).
    sub_folders = ["music", "noise", "speech"]
    csv_files = [music_csv, noise_csv, speech_csv]

    logger.info("Musan Data Preparation...")
    for sub_folder, csv_file in zip(sub_folders, csv_files):
        wav_lst = get_all_files(
            os.path.join(folder, sub_folder), match_and=[".wav"]
        )
        # Only create a csv that does not already exist.
        if not os.path.isfile(csv_file):
            logger.info(csv_file + " creation...")
            _prepare_csv(folder, wav_lst, csv_file, max_noise_len)
def download_vctk(destination, tmp_dir=None, device="cpu"):
    """Download dataset and perform resample to 16000 Hz.

    Downloads the six noisy-VCTK zip archives, unpacks them into a
    temporary directory, downsamples all wav folders from 48 kHz to
    16 kHz, and finally zips the result into ``destination``.

    Arguments
    ---------
    destination : str
        Place to put final zipped dataset.
    tmp_dir : str
        Location to store temporary files. Will use `tempfile` if not provided.
    device : str
        Passed directly to pytorch's ``.to()`` method. Used for resampling.
    """
    dataset_name = "noisy-vctk-16k"
    if tmp_dir is None:
        tmp_dir = tempfile.gettempdir()
    final_dir = os.path.join(tmp_dir, dataset_name)

    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(final_dir):
        os.mkdir(final_dir)

    prefix = "https://datashare.is.ed.ac.uk/bitstream/handle/10283/2791/"
    noisy_vctk_urls = [
        prefix + "clean_testset_wav.zip",
        prefix + "noisy_testset_wav.zip",
        prefix + "testset_txt.zip",
        prefix + "clean_trainset_28spk_wav.zip",
        prefix + "noisy_trainset_28spk_wav.zip",
        prefix + "trainset_28spk_txt.zip",
    ]

    zip_files = []
    for url in noisy_vctk_urls:
        filename = os.path.join(tmp_dir, url.split("/")[-1])
        zip_files.append(filename)
        # Archives already present in tmp_dir are not downloaded again.
        if not os.path.isfile(filename):
            logger.info("Downloading " + url)
            with urllib.request.urlopen(url) as response:
                with open(filename, "wb") as tmp_file:
                    logger.info("... to " + tmp_file.name)
                    shutil.copyfileobj(response, tmp_file)

    # Unzip; each archive is deleted right after extraction to save space.
    for zip_file in zip_files:
        logger.info("Unzipping " + zip_file)
        shutil.unpack_archive(zip_file, tmp_dir, "zip")
        os.remove(zip_file)

    # Move transcripts to final dir
    shutil.move(os.path.join(tmp_dir, "testset_txt"), final_dir)
    shutil.move(os.path.join(tmp_dir, "trainset_28spk_txt"), final_dir)

    # Downsample the four wav folders from 48 kHz to 16 kHz.
    dirs = [
        "noisy_testset_wav",
        "clean_testset_wav",
        "noisy_trainset_28spk_wav",
        "clean_trainset_28spk_wav",
    ]

    downsampler = Resample(orig_freq=48000, new_freq=16000)

    for directory in dirs:
        logger.info("Resampling " + directory)
        dirname = os.path.join(tmp_dir, directory)

        # Make directory to store downsampled files
        dirname_16k = os.path.join(final_dir, directory + "_16k")
        if not os.path.isdir(dirname_16k):
            os.mkdir(dirname_16k)

        # Load files and downsample
        for filename in get_all_files(dirname, match_and=[".wav"]):
            signal, rate = torchaudio.load(filename)
            downsampled_signal = downsampler(signal.view(1, -1).to(device))

            # Save downsampled file.
            # NOTE(review): filename[-12:] assumes fixed-length basenames
            # (e.g. 'p234_001.wav') — confirm against the VCTK naming scheme.
            torchaudio.save(
                os.path.join(dirname_16k, filename[-12:]),
                downsampled_signal[0].cpu(),
                sample_rate=16000,
                channels_first=False,
            )

            # Remove old file
            os.remove(filename)

        # Remove old directory
        os.rmdir(dirname)

    logger.info("Zipping " + final_dir)
    final_zip = shutil.make_archive(
        base_name=final_dir,
        format="zip",
        root_dir=os.path.dirname(final_dir),
        base_dir=os.path.basename(final_dir),
    )

    logger.info(f"Moving {final_zip} to {destination}")
    shutil.move(final_zip, os.path.join(destination, dataset_name + ".zip"))
def prepare_voicebank(data_folder, save_folder, valid_speaker_count=2, skip_prep=False):
    """
    Prepares the json files for the Voicebank dataset.

    Expects the data folder to be the same format as the output of
    ``download_vctk()`` below.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original Voicebank dataset is stored.
    save_folder : str
        The directory where to store the json files.
    valid_speaker_count : int
        The number of validation speakers to use (out of 28 in train set).
    skip_prep: bool
        If True, skip data preparation.

    Example
    -------
    >>> data_folder = '/path/to/datasets/Voicebank'
    >>> save_folder = 'exp/Voicebank_exp'
    >>> prepare_voicebank(data_folder, save_folder)
    """
    if skip_prep:
        return

    # Setting ouput files
    save_json_train = os.path.join(save_folder, TRAIN_JSON)
    save_json_valid = os.path.join(save_folder, VALID_JSON)
    save_json_test = os.path.join(save_folder, TEST_JSON)

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_test, save_json_valid):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # Folder layout produced by download_vctk(): *_16k wav dirs plus the
    # matching transcript dirs.
    train_clean_folder = os.path.join(data_folder, "clean_trainset_28spk_wav_16k")
    train_noisy_folder = os.path.join(data_folder, "noisy_trainset_28spk_wav_16k")
    train_txts = os.path.join(data_folder, "trainset_28spk_txt")
    test_clean_folder = os.path.join(data_folder, "clean_testset_wav_16k")
    test_noisy_folder = os.path.join(data_folder, "noisy_testset_wav_16k")
    test_txts = os.path.join(data_folder, "testset_txt")

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Additional checks to make sure the data folder contains Voicebank
    check_voicebank_folders(
        train_clean_folder,
        train_noisy_folder,
        train_txts,
        test_clean_folder,
        test_noisy_folder,
        test_txts,
    )

    logger.debug("Creating lexicon...")
    lexicon = create_lexicon(os.path.join(data_folder, "lexicon.txt"))
    logger.info("Creating json files for noisy VoiceBank...")
    logger.debug("Collecting files...")
    extension = [".wav"]
    # The first `valid_speaker_count` train speakers are held out for
    # validation: excluded from train, matched for valid.
    valid_speakers = TRAIN_SPEAKERS[:valid_speaker_count]
    wav_lst_train = get_all_files(
        train_noisy_folder, match_and=extension, exclude_or=valid_speakers,
    )
    wav_lst_valid = get_all_files(
        train_noisy_folder, match_and=extension, match_or=valid_speakers,
    )
    wav_lst_test = get_all_files(test_noisy_folder, match_and=extension)

    logger.debug("Creating json files for noisy VoiceBank...")
    create_json(wav_lst_train, save_json_train, train_clean_folder, train_txts, lexicon)
    create_json(wav_lst_valid, save_json_valid, train_clean_folder, train_txts, lexicon)
    create_json(wav_lst_test, save_json_test, test_clean_folder, test_txts, lexicon)
def prepare_timit(
    data_folder,
    save_json_train,
    save_json_valid,
    save_json_test,
    phn_set=39,
    uppercase=False,
    skip_prep=False,
):
    """
    Prepares the json files for the TIMIT dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original TIMIT dataset is stored.
    save_json_train : str
        The path where to store the training json file.
    save_json_valid : str
        The path where to store the valid json file.
    save_json_test : str
        The path where to store the test json file.
    phn_set : {60, 48, 39}, optional, Default: 39
        The phoneme set to use in the phn label.
        It could be composed of 60, 48, or 39 phonemes.
    uppercase : bool, optional
        Default: False
        This option must be True when the TIMIT dataset
        is in the upper-case version.
    skip_prep: bool
        Default: False
        If True, the data preparation is skipped.

    Example
    -------
    >>> from recipes.TIMIT.timit_prepare import prepare_timit
    >>> data_folder = 'datasets/TIMIT'
    >>> prepare_timit(data_folder, 'train.json', 'valid.json', 'test.json')
    """
    # Skip if needed
    if skip_prep:
        return

    # Getting speaker dictionary
    dev_spk, test_spk = _get_speaker()

    # Calibration sentences (sa1/sa2) are excluded from all splits.
    avoid_sentences = ["sa1", "sa2"]
    extension = [".wav"]

    # Checking TIMIT_uppercase
    if uppercase:
        avoid_sentences = [item.upper() for item in avoid_sentences]
        extension = [item.upper() for item in extension]
        dev_spk = [item.upper() for item in dev_spk]
        test_spk = [item.upper() for item in test_spk]

    # Check if this phase is already done (if so, skip it).
    # FIX: pass the three paths as separate arguments — the original wrapped
    # them in a single list, unlike every other `skip(...)` call in this file.
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains TIMIT
    _check_timit_folders(uppercase, data_folder)

    msg = "Creating json files for the TIMIT Dataset.."
    logger.info(msg)

    # Creating json files.
    # Both the valid and test manifests are built from the TIMIT "test"
    # folder: valid picks dev_spk speakers, test picks test_spk speakers.
    splits = ["train", "test", "test"]
    annotations = [save_json_train, save_json_valid, save_json_test]
    match_or = [None, dev_spk, test_spk]

    for split, save_file, match in zip(splits, annotations, match_or):
        if uppercase:
            match_lst = extension + [split.upper()]
        else:
            match_lst = extension + [split]

        # List of the wav files
        wav_lst = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=match,
            exclude_or=avoid_sentences,
        )
        # (Removed a leftover debug `print(wav_lst)` that could never fire:
        # `splits` contains no "dev" entry.)

        # Json creation
        create_json(wav_lst, save_file, uppercase, phn_set)
def prepare_dns(
    data_folder,
    save_folder,
    seg_size=10.0,
    valid_folder=None,
    valid_ratio=0.002,
    valid_snr_low=0,
    valid_snr_high=40,
    skip_prep=False,
):
    """
    Prepares the csv files for the DNS challenge dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original DNS dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    seg_size : float
        Split the file into multiple fix length segments (ms).
    valid_ratio : float
        Use this fraction of the training data as a validation set.
    valid_folder : str
        Location for storing mixed validation samples.
    valid_snr_low : float
        Lowest SNR to use when mixing the validation set.
    valid_snr_high : float
        Highest SNR to use when mixing the validiation set.
    skip_prep: bool
        If True, skip data preparation.

    Example
    -------
    >>> # This example requires the actual DNS dataset:
    >>> data_folder = 'datasets/DNS-Challenge'
    >>> save_folder = 'DNS_prepared'
    >>> prepare_dns(data_folder, save_folder)
    """
    # NOTE(review): the seg_size unit is stated as (ms) above, but the
    # default of 10.0 suggests seconds — confirm against `create_csv`.
    if skip_prep:
        return

    # Validation mixing needs a place to write the mixed samples.
    if valid_ratio > 0 and valid_folder is None:
        raise ValueError("Must provide folder for storing validation data")

    # Additional checks to make sure the data folder contains DNS
    _check_DNS_folders(data_folder)

    train_folder = os.path.join(data_folder, "datasets")
    test_folder = os.path.join(data_folder, "datasets", "test_set", "synthetic", "no_reverb")

    # Setting file extension.
    extension = [".wav"]

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Check if this phase is already done (if so, skip it)
    if skip(save_folder):
        logger.info("Preparation completed in previous run.")
        return

    logger.info("Creating csv files for the DNS Dataset...")

    # Setting ouput files
    save_csv_noise = os.path.join(save_folder, NOISE_CSV)
    save_csv_clean = os.path.join(save_folder, CLEAN_CSV)
    save_csv_valid = os.path.join(save_folder, VALID_CSV)
    save_csv_test = os.path.join(save_folder, TEST_CSV)

    # Get the list of files
    wav_lst_noise = get_all_files(os.path.join(train_folder, "noise"), match_and=extension)
    wav_lst_clean = get_all_files(os.path.join(train_folder, "clean"), match_and=extension)

    # Clean is excluded here, but will be picked up by `create_csv`
    wav_lst_test = get_all_files(
        test_folder, match_and=extension, exclude_or=["/clean/"],
    )

    # Split training into validation and training
    if valid_ratio > 0:
        # Sort to ensure same validation set for each run.
        wav_lst_noise.sort()
        wav_lst_clean.sort()

        # The first valid_count files of each sorted list become validation.
        valid_count = int(valid_ratio * len(wav_lst_clean))
        valid_lst_noise = wav_lst_noise[:valid_count]
        valid_lst_clean = wav_lst_clean[:valid_count]
        wav_lst_noise = wav_lst_noise[valid_count:]
        wav_lst_clean = wav_lst_clean[valid_count:]

        # Create noise csv to use when adding noise to validation samples.
        save_valid_noise = os.path.join(save_folder, "valid_noise.csv")
        create_csv(save_valid_noise, valid_lst_noise)
        create_csv(
            save_csv_valid,
            valid_lst_clean,
            seg_size=seg_size,
            noise_csv=save_valid_noise,
            noisy_folder=valid_folder,
            noise_snr_low=valid_snr_low,
            noise_snr_high=valid_snr_high,
        )

    # Test set has target in parallel "clean" directory
    create_csv(save_csv_test, wav_lst_test, has_target=True)

    # Create tr_clean.csv and tr_noise.csv for dynamic mixing the training data
    create_csv(save_csv_noise, wav_lst_noise)
    create_csv(save_csv_clean, wav_lst_clean, seg_size=seg_size)
def prepare_data(
    data_folder,
    save_json_train,
    save_json_valid,
    save_json_test,
    split_ratio=[80, 10, 10],
    different_speakers=False,
    seed=12,
):
    """
    Prepares the json files for the IEMOCAP dataset.

    We here use only the audio part of the dataset. The assumption is
    that the data folder is structured as:

    <session_id>/<emotion>/<file:name>.wav

    e.g. session1/ang/psno1_ang_s084_orgn.wav

    Please, process the original IEMOCAP folder to match the expected
    folder structure.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the transformed IEMOCAP dataset is stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.
    split_ratio: list
        List composed of three integers that sets split ratios for train,
        valid, and test sets, respectively. For instance
        split_ratio=[80, 10, 10] will assign 80% of the sentences to
        training, 10% for validation, and 10% for test.
    different_speakers : bool
        If True, the split is computed with `split_different_speakers`
        instead of the random `split_sets` split.
    seed : int
        Seed for reproducibility

    Example
    -------
    >>> data_folder = '/path/to/iemocap'
    >>> prepare_data(data_folder, 'train.json', 'valid.json', 'test.json')
    """
    # setting seeds for reproducible code.
    random.seed(seed)

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # FIX: build the probe path with os.path.join — the original used plain
    # string concatenation, which breaks when `data_folder` has no trailing
    # separator.
    probe_file = os.path.join(
        data_folder, "session1", "ang", "psno1_ang_s084_orgn.wav"
    )
    if not check_folders(probe_file):
        # Best-effort warning only, matching the original behavior.
        logger.info(
            "The data folder is not in the expected format. Expected <session_id>/<emo_id>/<file_name>.wav (e.g., session1/ang/psno1_ang_s084_orgn.wav)"
        )

    # List files and create manifest from list
    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}")

    extension = [".wav"]
    wav_list = get_all_files(data_folder, match_and=extension)

    if different_speakers:
        data_split = split_different_speakers(wav_list)
    else:
        # Randomly split the signal list into train, valid, and test sets.
        data_split = split_sets(wav_list, split_ratio)

    # Creating json files
    create_json(data_split["train"], save_json_train)
    create_json(data_split["valid"], save_json_valid)
    create_json(data_split["test"], save_json_test)
def prepare_data(
    data_original,
    data_transformed,
    save_json_train,
    save_json_valid,
    save_json_test,
    split_ratio=[80, 10, 10],
    different_speakers=False,
    seed=12,
):
    """
    Prepares the json files for the IEMOCAP dataset.

    We here use only the audio part of the dataset. The assumption is
    that the data folder is structured as:

    <session_id>/<emotion>/<file:name>.wav

    e.g. session1/ang/psno1_ang_s084_orgn.wav

    Please, process the original IEMOCAP folder to match the expected
    folder structure.

    Arguments
    ---------
    data_original : str
        Path to the folder where the original IEMOCAP dataset is stored.
    data_transformed : str
        Path to the folder where the transformed IEMOCAP dataset will be stored.
    save_json_train : str
        Path where the train data specification file will be saved.
    save_json_valid : str
        Path where the validation data specification file will be saved.
    save_json_test : str
        Path where the test data specification file will be saved.
    split_ratio: list
        List composed of three integers that sets split ratios for train,
        valid, and test sets, respectively. For instance
        split_ratio=[80, 10, 10] will assign 80% of the sentences to
        training, 10% for validation, and 10% for test.
    different_speakers : bool
        If True, the split is computed with `split_different_speakers`
        instead of the random `split_sets` split.
    seed : int
        Seed for reproducibility

    Example
    -------
    >>> data_original = '/path/to/iemocap/IEMOCAP_full_release/Session'
    >>> data_transformed = '/path/to/iemocap/IEMOCAP_ahsn_leave-two-speaker-out'
    >>> prepare_data(data_original, data_transformed, 'train.json', 'valid.json', 'test.json')
    """
    # setting seeds for reproducible code.
    random.seed(seed)

    # Check if this phase is already done (if so, skip it)
    if skip(save_json_train, save_json_valid, save_json_test):
        logger.info("Preparation completed in previous run, skipping.")
        return

    # Check if the transformed data folder exist, generate it otherwise.
    if not check_folders(data_transformed):
        logger.info(
            "The data transformed folder doesn't exist. Do the transformation step."
        )
        transform_data(data_original, data_transformed)
    else:
        logger.info("Data Transformation completed in previous run, skipping.")

    # Sanity check: the transformed tree must contain exactly NUMBER_UTT
    # files at depth 3 (<session>/<emotion>/<file>), otherwise abort.
    if (
        not len(list(glob.iglob(data_transformed + "/*/*/*", recursive=True)))
        == NUMBER_UTT
    ):
        logger.error(
            "Error: The data folder is not in the expected format. Expected <session_id>/<emo_id>/<file_name>.wav (e.g., session1/ang/psno1_ang_s084_orgn.wav)"
        )
        # NOTE(review): the message below has typos ("dirctory", missing
        # spaces) — left untouched here as it is runtime output.
        sys.exit(
            "Data transformed dirctory "
            + data_transformed
            + "contains: "
            + str(
                len(
                    list(
                        glob.iglob(data_transformed + "/*/*/*", recursive=True)
                    )
                )
            )
            + " file. Expected "
            + str(NUMBER_UTT)
            + "."
        )

    # List files and create manifest from list
    logger.info(
        f"Creating {save_json_train}, {save_json_valid}, and {save_json_test}")

    extension = [".wav"]

    # Randomly split the signal list into train, valid, and test sets.
    wav_list = get_all_files(data_transformed, match_and=extension)
    if different_speakers:
        data_split = split_different_speakers(wav_list)
    else:
        data_split = split_sets(wav_list, split_ratio)

    # Creating json files
    create_json(data_split["train"], save_json_train)
    create_json(data_split["valid"], save_json_valid)
    create_json(data_split["test"], save_json_test)
def prepare_timit(
    data_folder,
    splits,
    save_folder,
    kaldi_ali_tr=None,
    kaldi_ali_dev=None,
    kaldi_ali_test=None,
    kaldi_lab_opts=None,
    phn_set=39,
    uppercase=False,
):
    """
    Prepares the csv files for the TIMIT dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original TIMIT dataset is stored.
    splits : list
        List of splits to prepare from ['train', 'dev', 'test']
    save_folder : str
        The directory where to store the csv files.
    kaldi_ali_tr : dict, optional
        Default: 'None'
        When set, this is the directiory where the kaldi training
        alignments are stored. They will be automatically converted
        into pkl for an easier use within speechbrain.
    kaldi_ali_dev : str, optional
        Default: 'None'
        When set, this is the path to directory where the
        kaldi dev alignments are stored.
    kaldi_ali_test : str, optional
        Default: 'None'
        When set, this is the path to the directory where the
        kaldi test alignments are stored.
    kaldi_lab_opts : str, optional
        Default: 'None'
        Options passed along when converting the kaldi alignments.
    phn_set : {60, 48, 39}, optional, Default: 39
        The phoneme set to use in the phn label.
        It could be composed of 60, 48, or 39 phonemes.
    uppercase : bool, optional
        Default: False
        This option must be True when the TIMIT dataset
        is in the upper-case version.

    Example
    -------
    >>> from recipes.TIMIT.timit_prepare import prepare_timit
    >>> data_folder = 'datasets/TIMIT'
    >>> splits = ['train', 'dev', 'test']
    >>> save_folder = 'TIMIT_prepared'
    >>> prepare_timit(data_folder, splits, save_folder)
    """
    # Recorded configuration; used by `skip` to detect an identical
    # previous run and by `save_pkl` at the end.
    conf = {
        "data_folder": data_folder,
        "splits": splits,
        "kaldi_ali_tr": kaldi_ali_tr,
        "kaldi_ali_dev": kaldi_ali_dev,
        "kaldi_ali_test": kaldi_ali_test,
        "save_folder": save_folder,
        "phn_set": phn_set,
        "uppercase": uppercase,
    }

    # Getting speaker dictionary
    dev_spk, test_spk = _get_speaker()

    # Avoid calibration sentences
    avoid_sentences = ["sa1", "sa2"]

    # Setting file extension.
    extension = [".wav"]

    # Checking TIMIT_uppercase
    if uppercase:
        avoid_sentences = [item.upper() for item in avoid_sentences]
        extension = [item.upper() for item in extension]
        dev_spk = [item.upper() for item in dev_spk]
        test_spk = [item.upper() for item in test_spk]

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting ouput files
    save_opt = os.path.join(save_folder, OPT_FILE)
    save_csv_train = os.path.join(save_folder, TRAIN_CSV)
    save_csv_dev = os.path.join(save_folder, DEV_CSV)
    save_csv_test = os.path.join(save_folder, TEST_CSV)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains TIMIT
    _check_timit_folders(uppercase, data_folder)

    msg = "Creating csv file for the TIMIT Dataset.."
    logger.info(msg)

    # Creating csv file for training data
    if "train" in splits:
        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TRAIN"]
        else:
            match_lst = extension + ["train"]

        wav_lst_train = get_all_files(
            data_folder, match_and=match_lst, exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_train,
            save_csv_train,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_tr,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Creating csv file for dev data.
    # Note: dev files live in the TIMIT "test" folder; the dev set is
    # selected by speaker via match_or=dev_spk.
    if "dev" in splits:
        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TEST"]
        else:
            match_lst = extension + ["test"]

        wav_lst_dev = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=dev_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_dev,
            save_csv_dev,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_dev,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Creating csv file for test data, selected by test_spk speakers.
    if "test" in splits:
        # Checking TIMIT_uppercase
        if uppercase:
            match_lst = extension + ["TEST"]
        else:
            match_lst = extension + ["test"]

        wav_lst_test = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=test_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_test,
            save_csv_test,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_test,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # saving options
    save_pkl(conf, save_opt)
def prepare_ksponspeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    skip_prep=False,
):
    """
    This class prepares the csv files for the KsponSpeech dataset.
    Download link: https://aihub.or.kr/aidata/105/download

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original KsponSpeech dataset is stored.
    tr_splits : list
        List of train splits to prepare from ['train', 'dev', 'eval_clean',
        'eval_other'].
    dev_splits : list
        List of dev splits to prepare from ['dev'].
    te_splits : list
        List of test splits to prepare from ['eval_clean','eval_other'].
    save_folder : str
        The directory where to store the csv files.
    select_n_sentences : int
        Default : None
        If not None, only pick this many sentences.
    merge_lst : list
        List of KsponSpeech splits (e.g, eval_clean, eval_other) to
        merge in a single csv file.
    merge_name: str
        Name of the merged csv file.
    skip_prep: bool
        If True, data preparation is skipped.

    Example
    -------
    >>> data_folder = 'datasets/KsponSpeech'
    >>> tr_splits = ['train']
    >>> dev_splits = ['dev']
    >>> te_splits = ['eval_clean']
    >>> save_folder = 'KsponSpeech_prepared'
    >>> prepare_ksponspeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """
    if skip_prep:
        return
    # (Removed dead no-op self-assignments such as `data_folder = data_folder`.)
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains ksponspeech
    check_ksponspeech_folders(data_folder, splits)

    # parse trn file
    all_texts = {}
    for split_index, split in enumerate(splits):
        dirlist = split2dirs(split)
        wav_lst = []

        # `dir` renamed to `wav_dir`: the original shadowed the builtin.
        for wav_dir in dirlist:
            wav_lst += get_all_files(
                os.path.join(data_folder, wav_dir), match_and=[".wav"]
            )

        trnpath = os.path.join(data_folder, split + ".trn")
        text_dict = text_to_dict(trnpath)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder, wav_lst, text_dict, split, n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_kspon + ".csv" for split_kspon in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # saving options
    save_pkl(conf, save_opt)
def prepare_librispeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    create_lexicon=False,
    skip_prep=False,
):
    """
    This class prepares the csv files for the LibriSpeech dataset.
    Download link: http://www.openslr.org/12

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original LibriSpeech dataset is stored.
    tr_splits : list
        List of train splits to prepare from ['test-others','train-clean-100',
        'train-clean-360','train-other-500'].
    dev_splits : list
        List of dev splits to prepare from ['dev-clean','dev-others'].
    te_splits : list
        List of test splits to prepare from ['test-clean','test-others'].
    save_folder : str
        The directory where to store the csv files.
    select_n_sentences : int
        Default : None
        If not None, only pick this many sentences.
    merge_lst : list
        List of librispeech splits (e.g, train-clean, train-clean-360,..) to
        merge in a single csv file.
    merge_name: str
        Name of the merged csv file.
    create_lexicon: bool
        If True, it outputs csv files containing mapping between grapheme
        to phonemes. Use it for training a G2P system.
    skip_prep: bool
        If True, data preparation is skipped.

    Example
    -------
    >>> data_folder = 'datasets/LibriSpeech'
    >>> tr_splits = ['train-clean-100']
    >>> dev_splits = ['dev-clean']
    >>> te_splits = ['test-clean']
    >>> save_folder = 'librispeech_prepared'
    >>> prepare_librispeech(data_folder, save_folder, tr_splits, dev_splits, \
                            te_splits)
    """
    if skip_prep:
        return
    # (Removed dead no-op self-assignments such as `data_folder = data_folder`.)
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    logger.info("Data_preparation...")

    # Additional checks to make sure the data folder contains Librispeech
    check_librispeech_folders(data_folder, splits)

    # create csv files for each split
    all_texts = {}
    for split_index, split in enumerate(splits):
        # Hoist the loop-invariant folder path (was computed twice).
        split_folder = os.path.join(data_folder, split)
        wav_lst = get_all_files(split_folder, match_and=[".flac"])
        text_lst = get_all_files(split_folder, match_and=["trans.txt"])

        text_dict = text_to_dict(text_lst)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder, wav_lst, text_dict, split, n_sentences,
        )

    # Merging csv file if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_libri + ".csv" for split_libri in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Create lexicon.csv and oov.csv
    if create_lexicon:
        create_lexicon_and_oov_csv(all_texts, data_folder, save_folder)

    # saving options
    save_pkl(conf, save_opt)
for u in utterances: spk_id = Path(u).parent.parent.stem if spk_id not in speakers: speakers[spk_id] = [u] else: speakers[spk_id].append(u) return speakers, words_dict # split split_f = params.split_factors # we get all noises and rirs noises = [] for f in params.noises_folders: noises.extend(get_all_files(f, match_and=[".wav"])) rirs = [] for f in params.rirs_folders: rirs.extend(get_all_files(f, match_and=[".wav"])) # we split them in training, dev and eval noises = split_list(noises, split_f) rirs = split_list(rirs, split_f) # do the same for background noises if params.backgrounds_root: backgrounds = get_all_files(params.backgrounds_root, match_and=[".wav"]) backgrounds = split_list(backgrounds, split_f) else: backgrounds = [None] * 3 os.makedirs(os.path.join(params.out_folder, "metadata"), exist_ok=True)