def prepare_librispeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    create_lexicon=False,
    skip_prep=False,
):
    """
    This function prepares the csv files for the LibriSpeech dataset.
    Download link: http://www.openslr.org/12

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original LibriSpeech dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    tr_splits : list
        List of train splits to prepare from ['train-clean-100',
        'train-clean-360', 'train-other-500'].
    dev_splits : list
        List of dev splits to prepare from ['dev-clean', 'dev-other'].
    te_splits : list
        List of test splits to prepare from ['test-clean', 'test-other'].
    select_n_sentences : list
        Default : None
        If not None, a list with one entry per split; only pick this many
        sentences from each split.
    merge_lst : list
        List of LibriSpeech splits (e.g., train-clean-100, train-clean-360, ...)
        to merge into a single csv file.
    merge_name : str
        Name of the merged csv file.
    create_lexicon : bool
        If True, it outputs csv files containing the mapping between
        graphemes and phonemes. Use it for training a G2P system.
    skip_prep : bool
        If True, data preparation is skipped.

    Example
    -------
    >>> data_folder = 'datasets/LibriSpeech'
    >>> save_folder = 'librispeech_prepared'
    >>> tr_splits = ['train-clean-100']
    >>> dev_splits = ['dev-clean']
    >>> te_splits = ['test-clean']
    >>> prepare_librispeech(data_folder, save_folder, tr_splits, dev_splits, \
    te_splits)
    """
    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Create the saving folder if it does not exist
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data preparation...")

    # Additional checks to make sure the data folder contains LibriSpeech
    check_librispeech_folders(data_folder, splits)

    # Create csv files for each split
    all_texts = {}
    for split_index in range(len(splits)):
        split = splits[split_index]

        wav_lst = get_all_files(
            os.path.join(data_folder, split), match_and=[".flac"]
        )

        text_lst = get_all_files(
            os.path.join(data_folder, split), match_and=["trans.txt"]
        )

        text_dict = text_to_dict(text_lst)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder, wav_lst, text_dict, split, n_sentences,
        )

    # Merge csv files if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_libri + ".csv" for split_libri in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Create lexicon.csv and oov.csv
    if create_lexicon:
        create_lexicon_and_oov_csv(all_texts, data_folder, save_folder)

    # Save options
    save_pkl(conf, save_opt)
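
# `skip` and `create_lexicon_and_oov_csv` are helpers defined elsewhere in
# this file. As a reference, here is a minimal sketch of the caching contract
# that the `skip` call above relies on: preparation can be skipped when every
# split csv already exists and the previously saved options match the current
# ones. The names `skip_sketch` and `load_pkl` (the assumed counterpart of
# `save_pkl`) are illustrative assumptions, not the actual implementation.
def skip_sketch(splits, save_folder, conf):
    # Every split must already have its csv manifest...
    for split in splits:
        if not os.path.isfile(os.path.join(save_folder, split + ".csv")):
            return False
    # ...and the options saved by the previous run must match the current ones.
    save_opt = os.path.join(save_folder, OPT_FILE)
    if not os.path.isfile(save_opt):
        return False
    return load_pkl(save_opt) == conf
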
def prepare_TAS(data_folder, save_folder, type, train_splits, skip_prep=False):
    """
    This function prepares the Timers and Such dataset.
    If the folder does not exist, the zip file will be extracted.
    If the zip file does not exist, it will be downloaded.

    data_folder : path to the Timers and Such dataset.
    save_folder : path where to save the csv manifest files.
    type : one of the following:
        "direct": {input=audio, output=semantics}
        "multistage": {input=audio, output=semantics}
            (using ASR transcripts in the middle)
        "decoupled": {input=transcript, output=semantics}
            (using ground-truth transcripts)
        "joint-transcript-semantics": {input=audio, output=transcript+semantics}
        "joint-semantics-transcript": {input=audio, output=semantics+transcript}
    train_splits : list of splits to be joined to form the train csv.
    skip_prep : if True, skip data preparation.
    """
    if skip_prep:
        return
    if type == "decoupled":
        try:
            import inflect

            p = inflect.engine()
        except ModuleNotFoundError:
            logger.info(
                'Error: the inflect module must be installed to run the "decoupled" SLU recipe.'
            )
            logger.info("Install using `pip install inflect`.")
            raise

    # If the data folders do not exist, we need to extract the data
    if not os.path.isdir(os.path.join(data_folder, "train-synth")):
        # Check for zip file and download if it doesn't exist
        zip_location = os.path.join(data_folder, "timers-and-such.zip")
        if not os.path.exists(zip_location):
            url = "https://zenodo.org/record/4623772/files/timers-and-such-v1.0.zip?download=1"
            download_file(url, zip_location, unpack=True)
        else:
            logger.info("Extracting timers-and-such.zip...")
            shutil.unpack_archive(zip_location, data_folder)

    splits = [
        "train-real",
        "dev-real",
        "test-real",
        "train-synth",
        "dev-synth",
        "test-synth",
    ]
    ID_start = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = (
            os.path.join(save_folder, split) + "-type=%s.csv" % type
        )
        if os.path.exists(new_filename):
            continue
        logger.info("Preparing %s..." % new_filename)

        ID = []
        duration = []
        wav = []
        spk_id = []
        semantics = []
        transcript = []

        df = pd.read_csv(os.path.join(data_folder, split) + ".csv")
        for i in range(len(df)):
            ID.append(ID_start + i)
            signal = read_audio(os.path.join(data_folder, df.path[i]))
            duration.append(signal.shape[0] / 16000)

            wav.append(os.path.join(data_folder, df.path[i]))
            spk_id.append(df.speakerId[i])

            transcript_ = df.transcription[i]
            if type == "decoupled":
                words = transcript_.split()
                for w in range(len(words)):
                    words[w] = words[w].upper()
                    # If the word is numeric, we need to convert it to
                    # letters, to match what the ASR would output.
                    if any(c.isdigit() for c in words[w]):
                        if "AM" in words[w] or "PM" in words[w]:
                            AM_or_PM = "A M" if "AM" in words[w] else "P M"
                            if ":" in words[w]:
                                hour = words[w].split(":")[0]
                                minute = (
                                    words[w].split(":")[1].split("AM")[0]
                                    if "AM" in words[w]
                                    else words[w].split(":")[1].split("PM")[0]
                                )
                                words[w] = (
                                    p.number_to_words(hour).upper()
                                    + " "
                                    + p.number_to_words(minute).upper()
                                    + " "
                                    + AM_or_PM
                                )
                            else:
                                hour = (
                                    words[w].split("AM")[0]
                                    if "AM" in words[w]
                                    else words[w].split("PM")[0]
                                )
                                words[w] = (
                                    p.number_to_words(hour).upper()
                                    + " "
                                    + AM_or_PM
                                )
                        else:
                            words[w] = p.number_to_words(words[w]).upper()
                transcript_ = " ".join(words).replace("-", " ")
            transcript.append(transcript_)

            # Fix formatting error in some labels
            semantics_ = df.semantics[i].replace(".3333333333333333", ".33")
            if type == "direct" or type == "multistage" or type == "decoupled":
                semantics.append(semantics_)
            if type == "joint-transcript-semantics":
                # Prepend the transcript inside the semantics dict string
                semantics.append(
                    "{'transcript': '" + transcript_ + "'| " + semantics_[1:]
                )
            if type == "joint-semantics-transcript":
                # Append the transcript inside the semantics dict string
                semantics.append(
                    semantics_[:-1] + "| 'transcript': '" + transcript_ + "'}"
                )

        new_df = pd.DataFrame(
            {
                "ID": ID,
                "duration": duration,
                "wav": wav,
                "spk_id": spk_id,
                "semantics": semantics,
                "transcript": transcript,
            }
        )
        new_df.to_csv(new_filename, index=False)
        ID_start += len(df)

    # Merge train splits
    train_splits = [split + "-type=%s.csv" % type for split in train_splits]
    merge_csvs(save_folder, train_splits, "train-type=%s.csv" % type)

    # Create "all-real" split
    real_splits = [
        split + "-type=%s.csv" % type
        for split in ["train-real", "dev-real", "test-real"]
    ]
    merge_csvs(save_folder, real_splits, "all-real-type=%s.csv" % type)
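
# A standalone sketch (not part of the recipe) isolating the "decoupled"
# number normalization above, so it can be tested on its own. The helper
# name `normalize_token` is illustrative; only the `inflect` package is
# assumed.
import inflect

_p = inflect.engine()


def normalize_token(word):
    """Spell out digits so targets match plausible ASR output."""
    word = word.upper()
    if not any(c.isdigit() for c in word):
        return word
    if "AM" in word or "PM" in word:
        am_or_pm = "A M" if "AM" in word else "P M"
        digits = word.replace("AM", "").replace("PM", "")
        if ":" in digits:
            hour, minute = digits.split(":")
            spelled = (
                _p.number_to_words(hour).upper()
                + " "
                + _p.number_to_words(minute).upper()
            )
        else:
            spelled = _p.number_to_words(digits).upper()
        return (spelled + " " + am_or_pm).replace("-", " ")
    return _p.number_to_words(word).upper().replace("-", " ")


# e.g. normalize_token("9:30AM") -> "NINE THIRTY A M"
#      normalize_token("45")     -> "FORTY FIVE"
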
def prepare_ksponspeech(
    data_folder,
    save_folder,
    tr_splits=[],
    dev_splits=[],
    te_splits=[],
    select_n_sentences=None,
    merge_lst=[],
    merge_name=None,
    skip_prep=False,
):
    """
    This function prepares the csv files for the KsponSpeech dataset.
    Download link: https://aihub.or.kr/aidata/105/download

    Arguments
    ---------
    data_folder : str
        Path to the folder where the original KsponSpeech dataset is stored.
    save_folder : str
        The directory where to store the csv files.
    tr_splits : list
        List of train splits to prepare from ['train'].
    dev_splits : list
        List of dev splits to prepare from ['dev'].
    te_splits : list
        List of test splits to prepare from ['eval_clean', 'eval_other'].
    select_n_sentences : list
        Default : None
        If not None, a list with one entry per split; only pick this many
        sentences from each split.
    merge_lst : list
        List of KsponSpeech splits (e.g., eval_clean, eval_other) to merge
        into a single csv file.
    merge_name : str
        Name of the merged csv file.
    skip_prep : bool
        If True, data preparation is skipped.

    Example
    -------
    >>> data_folder = 'datasets/KsponSpeech'
    >>> tr_splits = ['train']
    >>> dev_splits = ['dev']
    >>> te_splits = ['eval_clean']
    >>> save_folder = 'KsponSpeech_prepared'
    >>> prepare_ksponspeech(data_folder, save_folder, tr_splits, dev_splits, \
    te_splits)
    """
    if skip_prep:
        return
    splits = tr_splits + dev_splits + te_splits
    conf = {
        "select_n_sentences": select_n_sentences,
    }

    # Create the saving folder if it does not exist
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    save_opt = os.path.join(save_folder, OPT_FILE)

    # Check if this phase is already done (if so, skip it)
    if skip(splits, save_folder, conf):
        logger.info("Skipping preparation, completed in previous run.")
        return
    else:
        logger.info("Data preparation...")

    # Additional checks to make sure the data folder contains KsponSpeech
    check_ksponspeech_folders(data_folder, splits)

    # Parse trn file
    all_texts = {}
    for split_index in range(len(splits)):
        split = splits[split_index]
        dirlist = split2dirs(split)
        wav_lst = []

        for dir_name in dirlist:
            wav_lst += get_all_files(
                os.path.join(data_folder, dir_name), match_and=[".wav"]
            )

        trnpath = os.path.join(data_folder, split + ".trn")
        text_dict = text_to_dict(trnpath)
        all_texts.update(text_dict)

        if select_n_sentences is not None:
            n_sentences = select_n_sentences[split_index]
        else:
            n_sentences = len(wav_lst)

        create_csv(
            save_folder, wav_lst, text_dict, split, n_sentences,
        )

    # Merge csv files if needed
    if merge_lst and merge_name is not None:
        merge_files = [split_kspon + ".csv" for split_kspon in merge_lst]
        merge_csvs(
            data_folder=save_folder,
            csv_lst=merge_files,
            merged_csv=merge_name,
        )

    # Save options
    save_pkl(conf, save_opt)
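
# A minimal usage sketch (paths are placeholders): prepare the train, dev,
# and both eval splits, then merge the two eval csvs into a single file.
# Note that `select_n_sentences`, when given, holds one cap per split,
# aligned with tr_splits + dev_splits + te_splits.
if __name__ == "__main__":
    prepare_ksponspeech(
        data_folder="datasets/KsponSpeech",
        save_folder="KsponSpeech_prepared",
        tr_splits=["train"],
        dev_splits=["dev"],
        te_splits=["eval_clean", "eval_other"],
        merge_lst=["eval_clean", "eval_other"],
        merge_name="eval.csv",
    )
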
def prepare_SLURP(data_folder, slu_type, train_splits):
    """
    This function prepares the SLURP dataset.
    The dataset is expected to have already been downloaded and unpacked
    under ``data_folder``.

    data_folder : path to the SLURP dataset.
    slu_type : one of the following:
        "direct": {input=audio, output=semantics}
        "multistage": {input=audio, output=semantics}
            (using ASR transcripts in the middle)
        "decoupled": {input=transcript, output=semantics}
            (using ground-truth transcripts)
    train_splits : list of splits to be joined to form the train csv.
    """
    splits = [
        "train",
        "train_synthetic",
        "devel",
        "test",
    ]
    utt_id = 0  # needed to have a unique ID for each audio
    for split in splits:
        new_filename = (
            os.path.join(data_folder, split) + "-type=%s.csv" % slu_type
        )
        if os.path.exists(new_filename):
            continue
        print("Preparing %s..." % new_filename)

        IDs = []
        duration = []
        wav = []
        wav_format = []
        wav_opts = []
        semantics = []
        semantics_format = []
        semantics_opts = []
        transcript = []
        transcript_format = []
        transcript_opts = []

        jsonl_path = os.path.join(
            data_folder, "dataset/slurp/" + split + ".jsonl"
        )
        with jsonlines.open(jsonl_path) as reader:
            for obj in reader:
                scenario = obj["scenario"]
                action = obj["action"]
                sentence_annotation = obj["sentence_annotation"]
                num_entities = sentence_annotation.count("[")
                entities = []
                for slot in range(num_entities):
                    # Slots are annotated inline as "[type : filler]"
                    slot_string = (
                        sentence_annotation.split("[")[slot + 1].split("]")[0]
                    )
                    slot_type = slot_string.split(":")[0].strip()
                    filler = slot_string.split(":")[1].strip()
                    entities.append({"type": slot_type, "filler": filler})
                for recording in obj["recordings"]:
                    IDs.append(utt_id)
                    if "synthetic" in split:
                        audio_folder = "scripts/audio/slurp_synth/"
                    else:
                        audio_folder = "scripts/audio/slurp_real/"
                    path = os.path.join(
                        data_folder, audio_folder, recording["file"]
                    )
                    signal = read_wav_soundfile(path)
                    duration.append(signal.shape[0] / 16000)

                    wav.append(path)
                    wav_format.append("flac")
                    wav_opts.append(None)

                    transcript_ = obj["sentence"]
                    if slu_type == "decoupled":
                        transcript_ = transcript_.upper()
                    transcript.append(transcript_)
                    transcript_format.append("string")
                    transcript_opts.append(None)

                    semantics_dict = {
                        "scenario": scenario,
                        "action": action,
                        "entities": entities,
                    }
                    # Commas in the dict would make the csv files tricky to
                    # parse; replace them with pipes.
                    semantics_ = str(semantics_dict).replace(",", "|")
                    semantics.append(semantics_)
                    semantics_format.append("string")
                    semantics_opts.append(None)
                    utt_id += 1

        df = pd.DataFrame(
            {
                "ID": IDs,
                "duration": duration,
                "wav": wav,
                "wav_format": wav_format,
                "wav_opts": wav_opts,
                "semantics": semantics,
                "semantics_format": semantics_format,
                "semantics_opts": semantics_opts,
                "transcript": transcript,
                "transcript_format": transcript_format,
                "transcript_opts": transcript_opts,
            }
        )
        df.to_csv(new_filename, index=False)

    # Merge train splits
    train_splits = [
        split + "-type=%s.csv" % slu_type for split in train_splits
    ]
    merge_csvs(data_folder, train_splits, "train-type=%s.csv" % slu_type)
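
# A standalone sketch (not part of the recipe) of the entity parsing above:
# SLURP's `sentence_annotation` field marks slots inline as
# "[type : filler]". The helper name `parse_entities` is illustrative.
def parse_entities(sentence_annotation):
    entities = []
    for chunk in sentence_annotation.split("[")[1:]:
        slot = chunk.split("]")[0]
        slot_type, filler = slot.split(":", 1)
        entities.append({"type": slot_type.strip(), "filler": filler.strip()})
    return entities


# e.g. parse_entities("wake me up at [time : five am] [date : tomorrow]")
# -> [{'type': 'time', 'filler': 'five am'},
#     {'type': 'date', 'filler': 'tomorrow'}]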