def prepare_librispeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    create_lexicon=False,
    skip_prep=False,
):
    """
    This function prepares the csv files for the LibriSpeech dataset.
    Download link: http://www.openslr.org/12

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original LibriSpeech dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    tr_splits : list
        List of train splits to prepare from ['train-clean-100',
        'train-clean-360', 'train-other-500'].
    dev_splits : list
        List of dev splits to prepare from ['dev-clean', 'dev-other'].
    te_splits : list
        List of test splits to prepare from ['test-clean', 'test-other'].
    select_n_sentences : list
        Default : None
        If not None, a list with the number of sentences to pick for each
        split (one entry per split).
    merge_lst : list
        List of LibriSpeech splits (e.g, train-clean-100, train-clean-360, ...)
        to merge in a single csv file.
    merge_name : str
        Name of the merged csv file.
    create_lexicon : bool
        If True, it outputs csv files containing the mapping between graphemes
        and phonemes. Use it for training a G2P system.
    skip_prep : bool
        If True, data preparation is skipped.

    Example
    -------
    >>> data_folder = 'datasets/LibriSpeech'
    >>> save_folder = 'librispeech_prepared'
    >>> prepare_librispeech(data_folder, save_folder, \
    tr_splits=['train-clean-100'], dev_splits=['dev-clean'], \
    te_splits=['test-clean'])
    """
    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data preparation...")

    # Additional checks to make sure the data folder contains LibriSpeech
    check_librispeech_folders(data_folder, splits)

    # Create csv files for each split
    all_texts = {}
    for split_index in range(len(splits)):
        split = splits[split_index]

        wav_lst = get_all_files(
            os.path.join(data_folder, split), match_and=[".flac"]
        )

        text_lst = get_all_files(
            os.path.join(data_folder, split), match_and=["trans.txt"]
        )

        text_dict = text_to_dict(text_lst)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(save_folder, wav_lst, text_dict, split, n_sentences)

    # Merging csv files if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_libri + ".csv" for split_libri in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Create lexicon.csv and oov.csv
    if create_lexicon:
        create_lexicon_and_oov_csv(all_texts, data_folder, save_folder)

    # Saving options
    save_pkl(conf, save_opt)
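# A minimal sketch of the `text_to_dict` helper assumed above (illustrative
# only; the recipe's own helper may differ in details). It parses LibriSpeech
# *.trans.txt files, where each line is "<utt-id> <transcript>", e.g.
# "84-121123-0000 GO DO YOU HEAR".
def _text_to_dict_sketch(text_lst):
    """Map utterance ids to transcriptions from a list of trans.txt files."""
    text_dict = {}
    for trans_file in text_lst:
        with open(trans_file, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                utt_id, text = line.split(" ", 1)
                text_dict[utt_id] = text
    return text_dict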
def prepare_voxceleb(
    data_folder,
    save_folder,
    verification_pairs_file,
    splits=["train", "dev", "test"],
    split_ratio=[90, 10],
    seg_dur=3.0,
    amp_th=5e-04,
    source=None,
    split_speaker=False,
    random_segment=False,
    skip_prep=False,
):
    """
    Prepares the csv files for the VoxCeleb1 or VoxCeleb2 datasets.
    Please follow the instructions in the README.md file for preparing
    VoxCeleb2.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original VoxCeleb dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    verification_pairs_file : str
        Txt file containing the verification split.
    splits : list
        List of splits to prepare from ['train', 'dev', 'test'].
    split_ratio : list
        List of int for train and validation splits.
    seg_dur : float
        Segment duration of a chunk in seconds (e.g., 3.0 seconds).
    amp_th : float
        Removes segments whose average amplitude is below the given threshold.
    source : str
        Path to the folder where the VoxCeleb dataset source is stored.
    split_speaker : bool
        Speaker-wise split.
    random_segment : bool
        Train on random segments.
    skip_prep : bool
        If True, skip preparation.

    Example
    -------
    >>> from recipes.VoxCeleb.voxceleb1_prepare import prepare_voxceleb
    >>> data_folder = 'data/VoxCeleb1/'
    >>> save_folder = 'VoxData/'
    >>> verification_pairs_file = 'veri_test.txt'
    >>> splits = ['train', 'dev']
    >>> split_ratio = [90, 10]
    >>> prepare_voxceleb(data_folder, save_folder, verification_pairs_file, \
    splits, split_ratio)
    """
    if skip_prep:
        return

    # Create configuration for easily skipping the data-preparation stage
    conf = {
        "data_folder": data_folder,
        "splits": splits,
        "split_ratio": split_ratio,
        "save_folder": save_folder,
        "seg_dur": seg_dur,
        "split_speaker": split_speaker,
    }

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting output files
    save_opt = os.path.join(save_folder, OPT_FILE)
    save_csv_train = os.path.join(save_folder, TRAIN_CSV)
    save_csv_dev = os.path.join(save_folder, DEV_CSV)

    # Populate the data folder with the VoxCeleb1 test data from the source
    if source is not None:
        if not os.path.exists(os.path.join(data_folder, "wav", "id10270")):
            logger.info(f"Extracting {source}/{TEST_WAV} to {data_folder}")
            shutil.unpack_archive(os.path.join(source, TEST_WAV), data_folder)
        if not os.path.exists(os.path.join(data_folder, "meta")):
            logger.info(f"Copying {source}/meta to {data_folder}")
            shutil.copytree(
                os.path.join(source, "meta"),
                os.path.join(data_folder, "meta"),
            )

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains VoxCeleb data
    if "," in data_folder:
        data_folder = data_folder.replace(" ", "").split(",")
    else:
        data_folder = [data_folder]

    # _check_voxceleb1_folders(data_folder, splits)

    msg = "\tCreating csv file for the VoxCeleb Dataset.."
    logger.info(msg)

    # Split data into train and validation sets according to split_ratio
    # (verification split)
    wav_lst_train, wav_lst_dev = _get_utt_split_lists(
        data_folder, split_ratio, verification_pairs_file, split_speaker
    )

    # Creating csv file for training data
    if "train" in splits:
        prepare_csv(
            seg_dur, wav_lst_train, save_csv_train, random_segment, amp_th
        )

    if "dev" in splits:
        prepare_csv(seg_dur, wav_lst_dev, save_csv_dev, random_segment, amp_th)

    # For PLDA verification
    if "test" in splits:
        prepare_csv_enrol_test(
            data_folder, save_folder, verification_pairs_file
        )

    # Saving options (useful to skip this phase when already done)
    save_pkl(conf, save_opt)
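# The `verification_pairs_file` consumed above follows the standard VoxCeleb
# trial-list layout: one trial per line, "<label> <enrol-utt> <test-utt>",
# e.g. "1 id10270/x00000.wav id10270/y00001.wav". A minimal reader sketch
# (illustrative only; the recipe parses this file internally):
def _read_verification_pairs_sketch(verification_pairs_file):
    """Yield (label, enrol_wav, test_wav) tuples from a VoxCeleb trial list."""
    with open(verification_pairs_file, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if len(parts) != 3:
                continue  # skip malformed lines
            label, enrol, test = parts
            yield int(label), enrol, test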
def prepare_ksponspeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    skip_prep=False,
):
    """
    This function prepares the csv files for the KsponSpeech dataset.
    Download link: https://aihub.or.kr/aidata/105/download

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original KsponSpeech dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    tr_splits : list
        List of train splits to prepare from ['train'].
    dev_splits : list
        List of dev splits to prepare from ['dev'].
    te_splits : list
        List of test splits to prepare from ['eval_clean', 'eval_other'].
    select_n_sentences : list
        Default : None
        If not None, a list with the number of sentences to pick for each
        split (one entry per split).
    merge_lst : list
        List of KsponSpeech splits (e.g, eval_clean, eval_other) to merge
        in a single csv file.
    merge_name : str
        Name of the merged csv file.
    skip_prep : bool
        If True, data preparation is skipped.

    Example
    -------
    >>> data_folder = 'datasets/KsponSpeech'
    >>> tr_splits = ['train']
    >>> dev_splits = ['dev']
    >>> te_splits = ['eval_clean']
    >>> save_folder = 'KsponSpeech_prepared'
    >>> prepare_ksponspeech(data_folder, save_folder, tr_splits, dev_splits, \
    te_splits)
    """
    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Saving folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data preparation...")

    # Additional checks to make sure the data folder contains KsponSpeech
    check_ksponspeech_folders(data_folder, splits)

    # Parse trn files
    all_texts = {}
    for split_index in range(len(splits)):
        split = splits[split_index]
        dirlist = split2dirs(split)
        wav_lst = []

        for dir_name in dirlist:
            wav_lst += get_all_files(
                os.path.join(data_folder, dir_name), match_and=[".wav"]
            )

        trnpath = os.path.join(data_folder, split + ".trn")
        text_dict = text_to_dict(trnpath)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(save_folder, wav_lst, text_dict, split, n_sentences)

    # Merging csv files if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_kspon + ".csv" for split_kspon in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Saving options
    save_pkl(conf, save_opt)
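# A minimal sketch of the `skip` helper used by prepare_librispeech and
# prepare_ksponspeech above (illustrative; the real helper and its `load_pkl`
# counterpart may differ): the phase is skipped only when every split csv
# already exists and the previously saved options match the current ones.
def _skip_sketch(splits, save_folder, conf):
    """Return True if all split csv files exist and the saved conf matches."""
    for split in splits:
        if not os.path.isfile(os.path.join(save_folder, split + ".csv")):
            return False
    save_opt = os.path.join(save_folder, OPT_FILE)
    if not os.path.isfile(save_opt):
        return False
    return load_pkl(save_opt) == conf  # assumes a load_pkl counterpart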
def prepare_ami(
    data_folder,
    manual_annot_folder,
    save_folder,
    ref_rttm_dir,
    meta_data_dir,
    split_type="full_corpus_asr",
    skip_TNO=True,
    mic_type="Lapel",
    vad_type="oracle",
    max_subseg_dur=3.0,
    overlap=1.5,
):
    """
    Prepares reference RTTM and JSON files for the AMI dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original amicorpus is stored.
    manual_annot_folder : str
        Directory where the manual annotations are stored.
    save_folder : str
        The save directory in results.
    ref_rttm_dir : str
        Directory to store reference RTTM files.
    meta_data_dir : str
        Directory to store the meta data (json) files.
    split_type : str
        Standard dataset split. See ami_splits.py for more information.
        Allowed split_type: "scenario_only", "full_corpus" or "full_corpus_asr"
    skip_TNO : bool
        Skips TNO meeting recordings if True.
    mic_type : str
        Type of microphone to be used.
    vad_type : str
        Type of VAD. Kept for future when VAD will be added.
    max_subseg_dur : float
        Duration in seconds of the subsegments to be prepared from larger
        segments.
    overlap : float
        Overlap duration in seconds between adjacent subsegments.

    Example
    -------
    >>> from recipes.AMI.ami_prepare import prepare_ami
    >>> data_folder = '/network/datasets/ami/amicorpus/'
    >>> manual_annot_folder = '/home/mila/d/dawalatn/nauman/ami_public_manual/'
    >>> save_folder = 'results/save/'
    >>> ref_rttm_dir = 'results/save/ref_rttms/'
    >>> meta_data_dir = 'results/save/metadata/'
    >>> prepare_ami(data_folder, manual_annot_folder, save_folder, \
    ref_rttm_dir, meta_data_dir, split_type='full_corpus_asr', \
    mic_type='Lapel')
    """

    # Meta files
    meta_files = [
        os.path.join(meta_data_dir, "ami_train." + mic_type + ".subsegs.json"),
        os.path.join(meta_data_dir, "ami_dev." + mic_type + ".subsegs.json"),
        os.path.join(meta_data_dir, "ami_eval." + mic_type + ".subsegs.json"),
    ]

    # Create configuration for easily skipping the data-preparation stage
    conf = {
        "data_folder": data_folder,
        "save_folder": save_folder,
        "ref_rttm_dir": ref_rttm_dir,
        "meta_data_dir": meta_data_dir,
        "split_type": split_type,
        "skip_TNO": skip_TNO,
        "mic_type": mic_type,
        "vad": vad_type,
        "max_subseg_dur": max_subseg_dur,
        "overlap": overlap,
        "meta_files": meta_files,
    }

    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting the output option file
    opt_file = "opt_ami_prepare." + mic_type + ".pkl"

    # Check if this phase is already done (if so, skip it)
    if skip(save_folder, conf, meta_files, opt_file):
        logger.info(
            "Skipping data preparation, as it was completed in previous run."
        )
        return

    msg = "\tCreating meta-data file for the AMI Dataset.."
    logger.debug(msg)

    # Get the split
    train_set, dev_set, eval_set = get_AMI_split(split_type)

    # Prepare RTTM from XML (manual annotations) and store as ground truth
    # Create ref_rttm directory
    if not os.path.exists(ref_rttm_dir):
        os.makedirs(ref_rttm_dir)

    # Create reference RTTM files
    splits = ["train", "dev", "eval"]
    split_sets = {"train": train_set, "dev": dev_set, "eval": eval_set}
    for i in splits:
        rttm_file = os.path.join(ref_rttm_dir, "fullref_ami_" + i + ".rttm")
        prepare_segs_for_RTTM(
            split_sets[i],
            rttm_file,
            data_folder,
            manual_annot_folder,
            i,
            skip_TNO,
        )

    # Create meta_files for splits
    if not os.path.exists(meta_data_dir):
        os.makedirs(meta_data_dir)

    for i in splits:
        rttm_file = os.path.join(ref_rttm_dir, "fullref_ami_" + i + ".rttm")
        meta_filename_prefix = "ami_" + i
        prepare_metadata(
            rttm_file,
            meta_data_dir,
            data_folder,
            meta_filename_prefix,
            max_subseg_dur,
            overlap,
            mic_type,
        )

    save_opt_file = os.path.join(save_folder, opt_file)
    save_pkl(conf, save_opt_file)
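# Each line of the reference RTTM files written above follows the standard
# NIST RTTM layout: "SPEAKER <rec-id> 1 <onset> <duration> <NA> <NA> <spk-id>
# <NA> <NA>". A minimal formatter sketch (illustrative; field handling inside
# prepare_segs_for_RTTM may differ):
def _rttm_line_sketch(rec_id, onset, duration, spk_id):
    """Format one SPEAKER record in NIST RTTM style."""
    return "SPEAKER %s 1 %.3f %.3f <NA> <NA> %s <NA> <NA>" % (
        rec_id,
        onset,
        duration,
        spk_id,
    )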
def create_csv(
    wav_lst,
    csv_file,
    uppercase,
    data_folder,
    phn_set,
    kaldi_lab=None,
    kaldi_lab_opts=None,
    kaldi_lab_dir=None,
):
    """
    Creates the csv file given a list of wav files.

    Arguments
    ---------
    wav_lst : list
        The list of wav files of a given data split.
    csv_file : str
        The path of the output csv file.
    uppercase : bool
        Whether this is the uppercase version of TIMIT.
    data_folder : str
        The location of the data.
    phn_set : {60, 48, 39}
        The phoneme set to use in the phn label.
    kaldi_lab : str, optional
        Default: None
        The path of the kaldi labels (optional).
    kaldi_lab_opts : str, optional
        Default: None
        A string containing the options used to compute the labels.
    kaldi_lab_dir : str, optional
        Default: None
        The directory where the pkl-converted kaldi labels are stored.

    Returns
    -------
    None
    """
    # Logging the csv creation
    msg = "Creating csv lists in %s..." % (csv_file)
    logger.info(msg)

    # Reading kaldi labels if needed:
    snt_no_lab = 0
    if kaldi_lab is not None:
        lab = read_kaldi_lab(kaldi_lab, kaldi_lab_opts)
        if kaldi_lab_dir is None:
            # Assumed fallback: prepare_timit does not forward kaldi_lab_dir,
            # so store the converted labels next to the csv file.
            kaldi_lab_dir = os.path.join(
                os.path.dirname(csv_file), "kaldi_labels"
            )
        if not os.path.exists(kaldi_lab_dir):
            os.makedirs(kaldi_lab_dir)

    csv_lines = [
        [
            "ID",
            "duration",
            "wav",
            "wav_format",
            "wav_opts",
            "spk_id",
            "spk_id_format",
            "spk_id_opts",
            "phn",
            "phn_format",
            "phn_opts",
            "wrd",
            "wrd_format",
            "wrd_opts",
            "ground_truth_phn_ends",
            "ground_truth_phn_ends_format",
            "ground_truth_phn_ends_opts",
        ]
    ]

    if kaldi_lab is not None:
        csv_lines[0].append("kaldi_lab")
        csv_lines[0].append("kaldi_lab_format")
        csv_lines[0].append("kaldi_lab_opts")

    # Processing all the wav files in the list
    for wav_file in wav_lst:

        # Getting sentence and speaker ids
        spk_id = wav_file.split("/")[-2]
        snt_id = wav_file.split("/")[-1].replace(".wav", "")
        snt_id = spk_id + "_" + snt_id

        missing_lab = False
        if kaldi_lab is not None:
            if snt_id not in lab.keys():
                missing_lab = True
                msg = (
                    "The sentence %s does not have a corresponding "
                    "kaldi label" % (snt_id)
                )
                logger.info(msg)
                snt_no_lab = snt_no_lab + 1
            else:
                snt_lab_path = os.path.join(kaldi_lab_dir, snt_id + ".pkl")
                save_pkl(lab[snt_id], snt_lab_path)

            # If too many kaldi labels are missing, raise an error
            if snt_no_lab / len(wav_lst) > 0.05:
                err_msg = (
                    "Too many sentences do not have the "
                    "corresponding kaldi label. Please check data and "
                    "kaldi labels (check %s and %s)."
                    % (data_folder, kaldi_lab)
                )
                logger.error(err_msg)
                raise ValueError(err_msg)

        if missing_lab:
            continue

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = len(signal) / SAMPLERATE

        # Retrieving words and checking for uppercase
        if uppercase:
            wrd_file = wav_file.replace(".WAV", ".WRD")
        else:
            wrd_file = wav_file.replace(".wav", ".wrd")
        if not os.path.exists(wrd_file):
            err_msg = "The wrd file %s does not exist!" % (wrd_file)
            raise FileNotFoundError(err_msg)
        with open(wrd_file) as f:
            words = [line.rstrip("\n").split(" ")[2] for line in f]
        words = " ".join(words)

        # Retrieving phonemes
        if uppercase:
            phn_file = wav_file.replace(".WAV", ".PHN")
        else:
            phn_file = wav_file.replace(".wav", ".phn")
        if not os.path.exists(phn_file):
            err_msg = "The phn file %s does not exist!" % (phn_file)
            raise FileNotFoundError(err_msg)

        # Getting the phoneme and ground truth ends lists from the phn files
        phonemes, ends = get_phoneme_lists(phn_file, phn_set)

        # Composition of the csv_line
        csv_line = [
            snt_id,
            str(duration),
            wav_file,
            "wav",
            "",
            spk_id,
            "string",
            "",
            str(phonemes),
            "string",
            "",
            str(words),
            "string",
            "label:False",
            str(ends),
            "string",
            "label:False",
        ]

        if kaldi_lab is not None:
            csv_line.append(snt_lab_path)
            csv_line.append("pkl")
            csv_line.append("")

        # Adding this line to the csv_lines list
        csv_lines.append(csv_line)

    # Writing the csv lines
    _write_csv(csv_lines, csv_file)
    msg = "%s successfully created!" % (csv_file)
    logger.info(msg)
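# A minimal sketch of the `_write_csv` helper assumed above (illustrative
# only; the recipe's own helper may differ): it dumps the accumulated rows
# with the standard csv module.
import csv


def _write_csv_sketch(csv_lines, csv_file):
    """Write a list of rows (lists of str) to csv_file."""
    with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter=",")
        for line in csv_lines:
            writer.writerow(line)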
def prepare_timit(
    data_folder,
    splits,
    save_folder,
    kaldi_ali_tr=None,
    kaldi_ali_dev=None,
    kaldi_ali_test=None,
    kaldi_lab_opts=None,
    phn_set=39,
    uppercase=False,
):
    """
    Prepares the csv files for the TIMIT dataset.

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original TIMIT dataset is stored.
    splits : list
        List of splits to prepare from ['train', 'dev', 'test'].
    save_folder : str
        The directory where to store the csv files.
    kaldi_ali_tr : str, optional
        Default: None
        When set, this is the directory where the kaldi training alignments
        are stored. They will be automatically converted into pkl for an
        easier use within speechbrain.
    kaldi_ali_dev : str, optional
        Default: None
        When set, this is the path to the directory where the kaldi dev
        alignments are stored.
    kaldi_ali_test : str, optional
        Default: None
        When set, this is the path to the directory where the kaldi test
        alignments are stored.
    kaldi_lab_opts : str, optional
        Default: None
        A string containing the options used to compute the kaldi labels.
    phn_set : {60, 48, 39}, optional
        Default: 39
        The phoneme set to use in the phn label. It could be composed of
        60, 48, or 39 phonemes.
    uppercase : bool, optional
        Default: False
        This option must be True when the TIMIT dataset is in the upper-case
        version.

    Example
    -------
    >>> from recipes.TIMIT.timit_prepare import prepare_timit
    >>> data_folder = 'datasets/TIMIT'
    >>> splits = ['train', 'dev', 'test']
    >>> save_folder = 'TIMIT_prepared'
    >>> prepare_timit(data_folder, splits, save_folder)
    """
    conf = {
        "data_folder": data_folder,
        "splits": splits,
        "kaldi_ali_tr": kaldi_ali_tr,
        "kaldi_ali_dev": kaldi_ali_dev,
        "kaldi_ali_test": kaldi_ali_test,
        "save_folder": save_folder,
        "phn_set": phn_set,
        "uppercase": uppercase,
    }

    # Getting speaker dictionary
    dev_spk, test_spk = _get_speaker()

    # Avoid calibration sentences
    avoid_sentences = ["sa1", "sa2"]

    # Setting file extension
    extension = [".wav"]

    # Checking TIMIT_uppercase
    if uppercase:
        avoid_sentences = [item.upper() for item in avoid_sentences]
        extension = [item.upper() for item in extension]
        dev_spk = [item.upper() for item in dev_spk]
        test_spk = [item.upper() for item in test_spk]

    # Setting the save folder
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    # Setting output files
    save_opt = os.path.join(save_folder, OPT_FILE)
    save_csv_train = os.path.join(save_folder, TRAIN_CSV)
    save_csv_dev = os.path.join(save_folder, DEV_CSV)
    save_csv_test = os.path.join(save_folder, TEST_CSV)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return

    # Additional checks to make sure the data folder contains TIMIT
    _check_timit_folders(uppercase, data_folder)

    msg = "Creating csv file for the TIMIT Dataset.."
    logger.info(msg)

    # Creating csv file for training data
    if "train" in splits:
        # Checking TIMIT_uppercase
        match_lst = extension + (["TRAIN"] if uppercase else ["train"])

        wav_lst_train = get_all_files(
            data_folder,
            match_and=match_lst,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_train,
            save_csv_train,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_tr,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Creating csv file for dev data
    if "dev" in splits:
        # Checking TIMIT_uppercase
        match_lst = extension + (["TEST"] if uppercase else ["test"])

        wav_lst_dev = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=dev_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_dev,
            save_csv_dev,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_dev,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Creating csv file for test data
    if "test" in splits:
        # Checking TIMIT_uppercase
        match_lst = extension + (["TEST"] if uppercase else ["test"])

        wav_lst_test = get_all_files(
            data_folder,
            match_and=match_lst,
            match_or=test_spk,
            exclude_or=avoid_sentences,
        )

        create_csv(
            wav_lst_test,
            save_csv_test,
            uppercase,
            data_folder,
            phn_set,
            kaldi_lab=kaldi_ali_test,
            kaldi_lab_opts=kaldi_lab_opts,
        )

    # Saving options
    save_pkl(conf, save_opt)
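# Usage sketch for a run with Kaldi alignments. All paths below are
# placeholders, and the "ali-to-pdf" option string is an assumption about
# what read_kaldi_lab accepts; adapt both to your setup.
# prepare_timit(
#     data_folder="datasets/TIMIT",
#     splits=["train", "dev", "test"],
#     save_folder="TIMIT_prepared",
#     kaldi_ali_tr="kaldi/exp/tri3_ali_train",
#     kaldi_ali_dev="kaldi/exp/tri3_ali_dev",
#     kaldi_ali_test="kaldi/exp/tri3_ali_test",
#     kaldi_lab_opts="ali-to-pdf",
#     phn_set=39,
# )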