def prepare_explicit_word_level_tokens(self, tokenizer, EOT_token_id=None):
    """
    Loads all tokenized word-level dialogs and inserts, between the turns,
    either a special EOT_token_id (if not None) or the index of the next
    speaker token.
    """
    tokenized_explicit_word_path = self.get_tokenized_root(
        level="word",
        explicit_turns=True,
        EOT_token_id=EOT_token_id,
        chunk_size=-1,
    )
    if not self.check_if_dir_exists(tokenized_explicit_word_path, ".json"):
        self.prepare_word_level_tokens(tokenizer)  # make sure the necessary data exists
        makedirs(tokenized_explicit_word_path, exist_ok=True)

        # Copy tokenizer info
        src = join(self.tokenized_word_level_root, "tokenizer_info")
        dst = join(tokenized_explicit_word_path, "tokenizer_info")
        shutil.copy(src, dst)

        tok_files = glob(join(self.tokenized_word_level_root, "*.json"))
        for tokenized_turn_level_path in tqdm(
                tok_files, desc=f"{self.NAME} Explicit turns"):
            tokenized_turn_level_dialog = read_json(tokenized_turn_level_path)
            explicit_turns = add_explicit_turn_shift_token(
                tokenized_turn_level_dialog, EOT_token_id)
            json_name = basename(tokenized_turn_level_path)
            write_json(explicit_turns,
                       join(tokenized_explicit_word_path, json_name))
    return tokenized_explicit_word_path
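# The helper `add_explicit_turn_shift_token` is referenced above but not shown
# in this section. A minimal sketch of what it might look like, assuming the
# tokenized dialog dicts produced below ("input_ids", "speaker_ids", ...):
# on every speaker change it inserts either the given EOT_token_id or, if
# EOT_token_id is None, the upcoming speaker's token id. The real version
# likely also extends "word_ids", "starts" and "ends".
def add_explicit_turn_shift_token(tokenized_dialog, EOT_token_id=None):
    input_ids, speaker_ids = [], []
    prev_speaker = None
    for tok, spk in zip(tokenized_dialog["input_ids"],
                        tokenized_dialog["speaker_ids"]):
        if prev_speaker is not None and spk != prev_speaker:
            # explicit turn-shift token in between the turns
            shift_token = EOT_token_id if EOT_token_id is not None else spk
            input_ids.append(shift_token)
            speaker_ids.append(spk)
        input_ids.append(tok)
        speaker_ids.append(spk)
        prev_speaker = spk
    return {"input_ids": input_ids, "speaker_ids": speaker_ids}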
def _process_turn_level(self):
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()  # downloads if necessary

    data = read_json(join(self.raw_data_root, "data.json"))
    test_filepaths = read_txt(join(self.raw_data_root, "testListFile.json"))
    val_filepaths = read_txt(join(self.raw_data_root, "valListFile.json"))

    train_filepaths = []
    for session_name, v in tqdm(data.items(), desc=f"{self.NAME} Turn-level"):
        dialog = []
        start = 0
        for i, utt in enumerate(v["log"]):
            speaker_id = 0 if i % 2 == 0 else 1
            dialog.append(
                {
                    "text": utt["text"],
                    "speaker_id": speaker_id,
                    "start": start,
                }
            )
            start += 1

        # we only know which files are for validation and testing
        if not (session_name in test_filepaths or session_name in val_filepaths):
            train_filepaths.append(session_name)

        # save file
        write_json(dialog, join(self.turn_level_root, session_name))

    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
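# The I/O helpers used throughout this section (read_json, write_json,
# read_txt, write_txt) are not shown here. A minimal sketch of plausible
# implementations, assuming utf-8 text and one entry per line in txt files:
import json


def read_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def write_json(data, path):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False)


def read_txt(path):
    with open(path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f]


def write_txt(lines, path):
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))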
def _process_word_level(self):
    print(f"{self.NAME}: process_word_level")  # logger

    # check if word level exists
    print(self.word_level_root)
    if not self.check_if_dir_exists(self.word_level_root):
        print("word level data not found")

        if not (exists(self.raw_data_root) and isdir(self.raw_data_root)):
            print("raw data not found: ", self.raw_data_root)
            self.download_text()

        # Extract word level
        makedirs(self.word_level_root, exist_ok=True)
        tu_path = join(self.raw_data_root, "Data/timed-units")
        pos_path = join(self.raw_data_root, "Data/pos")  # not used below
        token_path = join(self.raw_data_root, "Data/tokens")  # not used below
        dialog_ids = set([f.split(".")[0] for f in listdir(tu_path)])
        for dialog_id in tqdm(dialog_ids, desc=f"{self.NAME} Dialogs"):
            tu_path_g = join(tu_path, dialog_id + ".g.timed-units.xml")
            tu_path_f = join(tu_path, dialog_id + ".f.timed-units.xml")
            dialog_words = self._extract_words(tu_path_g, speaker_id=0)
            dialog_words += self._extract_words(tu_path_f, speaker_id=1)
            dialog_words.sort(key=lambda x: x["start"])
            write_json(
                dialog_words,
                join(self.word_level_root, dialog_id + ".json")
            )
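# `_extract_words` parses one Maptask timed-units XML file per speaker. A
# hedged sketch using xml.etree; the tag and attribute names ("tu", "start",
# "end") are assumptions about the Maptask annotation format, not verified:
from xml.etree import ElementTree


def _extract_words(self, xml_path, speaker_id):
    words = []
    root = ElementTree.parse(xml_path).getroot()
    for elem in root.iter("tu"):  # timed-unit elements (assumed tag name)
        word = (elem.text or "").strip()
        if not word:
            continue
        words.append({
            "word": word,
            "start": float(elem.attrib["start"]),
            "end": float(elem.attrib["end"]),
            "speaker_id": speaker_id,
        })
    return words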
def _process_turn_level(self):
    print(f"{self.NAME}: process_turn_level (slow)")

    # From the super class: makes sure that the data we need exists
    self.prepare_word_level()  # word-level dialogs required
    self.prepare_vad()  # processed VAD values required

    # Extract turn level
    makedirs(self.turn_level_root, exist_ok=True)

    # loop over entries in the word-level processing and transform to turns
    word_level_files = glob(join(self.word_level_root, "*.json"))
    for word_level_path in tqdm(word_level_files):
        json_name = basename(word_level_path)
        audio_path = self.get_audio_path(json_name.replace(".json", ""))
        vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))

        word_level_dialog = read_json(word_level_path)
        vad = torch.load(vad_path)  # list of (start, end) times
        duration = get_duration_sox(audio_path)
        sr = get_sample_rate_sox(audio_path)
        word_level_turns = word_level_to_turns(
            word_level_dialog,
            vad,
            duration,
            sr,
        )
        write_json(word_level_turns, join(self.turn_level_root, json_name))
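# `word_level_to_turns` does the heavy lifting above. The real version also
# uses the VAD track, audio duration and sample rate (e.g. to resolve
# overlapping speech); the sketch below is a deliberately simplified stand-in
# that only groups consecutive same-speaker words into turns, with
# vad/duration/sr accepted but unused:
def word_level_to_turns_simple(word_level_dialog, vad=None, duration=None, sr=None):
    turns = []
    for w in sorted(word_level_dialog, key=lambda x: x["start"]):
        if turns and turns[-1]["speaker_id"] == w["speaker_id"]:
            # same speaker continues: extend the current turn
            turns[-1]["text"] += " " + w["word"]
            turns[-1]["end"] = w["end"]
        else:
            turns.append({
                "text": w["word"],
                "speaker_id": w["speaker_id"],
                "start": w["start"],
                "end": w["end"],
            })
    return turns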
def _process_turn_level(self):
    """
    Iterates over rows in the txt files and cleans the text
    (e.g. "don ´ t" -> "don't", "tonight ?" -> "tonight?"), adds
    speaker_id, dialog act (integer) and emotion (integer).

    Saves each dialog in a json file:
        [
            {'text', 'speaker_id', 'start', 'act', 'emotion'},
            ...,
            {'text', 'speaker_id', 'start', 'act', 'emotion'},
        ]
    """
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()  # make sure the data is accessible

    dialog_num = 0
    print(f"{self.NAME} Turn-level")
    for split in ["train", "test", "validation"]:
        dialog_text = read_txt(
            join(self.raw_data_root, split, f"dialogues_{split}.txt"))
        dialog_emotion = read_txt(
            join(self.raw_data_root, split, f"dialogues_emotion_{split}.txt"))
        dialog_act = read_txt(
            join(self.raw_data_root, split, f"dialogues_act_{split}.txt"))

        filepaths = []
        for text, emotion, act in tqdm(
                zip(dialog_text, dialog_emotion, dialog_act),
                desc=split,
                total=len(dialog_act),
        ):
            text = text.strip().split(self.EOT)[:-1]
            emotion = emotion.split()
            act = act.split()
            conversation = []
            for i, (t, e, a) in enumerate(zip(text, emotion, act)):
                speaker_id = 0 if i % 2 == 0 else 1
                conversation.append({
                    "text": clean_daily(t),
                    "speaker_id": speaker_id,
                    "act": int(a),
                    "emotion": int(e),
                    "start": i,
                })
            savename = f"dd{dialog_num}.json"
            write_json(conversation, join(self.turn_level_root, savename))
            filepaths.append(savename)
            dialog_num += 1

        if split == "validation":
            write_txt(filepaths, join(self.root, "val.txt"))
        else:
            write_txt(filepaths, join(self.root, f"{split}.txt"))
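# `clean_daily` normalizes DailyDialog text. A minimal sketch covering the
# two cases named in the docstring above (stray "´"-style apostrophes and
# spaces before punctuation); the real cleaning likely handles more cases:
import re


def clean_daily(text):
    text = re.sub(r"\s*[´’`]\s*", "'", text)      # "don ´ t"   -> "don't"
    text = re.sub(r"\s+([?!.,;:])", r"\1", text)  # "tonight ?" -> "tonight?"
    return text.strip()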
def prepare_turn_level_tokens(self, tokenizer):
    if not self.check_if_dir_exists(self.tokenized_turn_level_root, ".json"):
        self.prepare_turn_level()
        makedirs(self.tokenized_turn_level_root, exist_ok=True)

        # TOKENIZER SANITY CHECK: save tokenizer info for later checks
        _ = tokenizer_info(tokenizer, self.tokenized_turn_level_root)

        t = time.time()
        broken_files = []
        for turn_level_path in tqdm(
                glob(join(self.turn_level_root, "*.json")),
                desc=f"Tokenizing Turn-level {self.NAME}",
        ):
            turn_level_dialog = read_json(turn_level_path)
            (
                input_ids,
                speaker_ids,
                word_ids,
                starts,
                ends,
            ) = tokenize_turn_level_dialog(turn_level_dialog,
                                           tokenizer,
                                           remove_punctuation=True)
            if len(input_ids) > 1:
                data = {
                    "input_ids": input_ids,
                    "speaker_ids": speaker_ids,
                    "word_ids": word_ids,
                }
                if len(starts) > 0:
                    data["starts"] = starts
                if len(ends) > 0:
                    data["ends"] = ends
                write_json(
                    data,
                    join(self.tokenized_turn_level_root,
                         basename(turn_level_path)),
                )
            else:
                broken_files.append(basename(turn_level_path))

        t = time.time() - t
        print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
        if len(broken_files) > 0:
            print(f"{self.NAME} broken: {len(broken_files)}")
            write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
    return self.tokenized_turn_level_root
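# `tokenize_turn_level_dialog` is assumed to flatten a turn-level dialog into
# parallel token/speaker/word-id lists using a HuggingFace-style tokenizer.
# A hedged sketch; the punctuation handling and the optional start/end times
# of the real implementation are only approximated here:
import re


def tokenize_turn_level_dialog(dialog, tokenizer, remove_punctuation=False):
    input_ids, speaker_ids, word_ids, starts, ends = [], [], [], [], []
    word_counter = 0
    for turn in dialog:
        text = turn["text"]
        if remove_punctuation:
            text = re.sub(r"[?!.,;:]", "", text)
        for word in text.split():
            # leading space so BPE tokenizers treat each word as word-initial
            for tok in tokenizer.encode(" " + word, add_special_tokens=False):
                input_ids.append(tok)
                speaker_ids.append(turn["speaker_id"])
                word_ids.append(word_counter)
            word_counter += 1
    # starts/ends stay empty for text-only corpora (cf. the length checks above)
    return input_ids, speaker_ids, word_ids, starts, ends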
def tokenizer_info(tokenizer, dirpath=None):
    # local name `info` avoids shadowing this function's own name
    info = {
        "name": tokenizer.__class__.__name__,
        "vocab_size": tokenizer.vocab_size,
        "len": len(tokenizer),
        "special_tokens_map": tokenizer.special_tokens_map,
    }
    if dirpath is not None:
        write_json(info, join(dirpath, "tokenizer_info"))
    return info
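# Example usage (assuming the HuggingFace transformers GPT-2 tokenizer):
#
#     from transformers import GPT2Tokenizer
#     tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
#     info = tokenizer_info(tokenizer, dirpath="data/tokenized_turn_level")
#     # info -> {"name": "GPT2Tokenizer", "vocab_size": 50257, ...}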
def prepare_word_level_tokens(self, tokenizer):
    if not self.check_if_dir_exists(self.tokenized_word_level_root):
        self.prepare_word_level()
        makedirs(self.tokenized_word_level_root, exist_ok=True)

        # TOKENIZER SANITY CHECK: save tokenizer info for later checks
        _ = tokenizer_info(tokenizer, self.tokenized_word_level_root)

        desc = f"Tokenizing Word-level {self.NAME}"
        t = time.time()
        broken_files = []
        for word_level_path in tqdm(glob(join(self.word_level_root, "*.json")),
                                    desc=desc):
            json_name = basename(word_level_path)
            word_level_dialog = read_json(word_level_path)
            (
                input_ids,
                speaker_ids,
                word_ids,
                starts,
                ends,
            ) = tokenize_word_level_dialog(
                word_level_dialog,
                tokenizer,
            )
            if len(input_ids) > 1:
                write_json(
                    {
                        "input_ids": input_ids,
                        "speaker_ids": speaker_ids,
                        "starts": starts,
                        "ends": ends,
                        "word_ids": word_ids,
                    },
                    join(self.tokenized_word_level_root, json_name),
                )
            else:
                broken_files.append(json_name)

        t = time.time() - t
        print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
        if len(broken_files) > 0:
            print(f"{self.NAME} broken: {len(broken_files)}")
            write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
    return self.tokenized_word_level_root
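# `tokenize_word_level_dialog` differs from the turn-level variant in that it
# consumes one timed word dict at a time and can propagate the word timings.
# A hedged sketch under the same assumptions as the turn-level sketch above:
def tokenize_word_level_dialog(word_level_dialog, tokenizer):
    input_ids, speaker_ids, word_ids, starts, ends = [], [], [], [], []
    for word_counter, w in enumerate(word_level_dialog):
        for tok in tokenizer.encode(" " + w["word"], add_special_tokens=False):
            input_ids.append(tok)
            speaker_ids.append(w["speaker_id"])
            word_ids.append(word_counter)
            # every subword token inherits its word's start/end time
            starts.append(w["start"])
            ends.append(w["end"])
    return input_ids, speaker_ids, word_ids, starts, ends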
def _process_turn_level(self):
    """
    There are no pre-defined splits, so we set a random seed (see imports)
    and split all domains into train/val/test.
    """
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()  # make sure the data is accessible

    total, skipped = 0, 0
    filenames = []
    data = read_json(join(self.raw_data_root, "data.json"))
    for dialog in data:
        filename = dialog["conversationId"] + ".json"
        conversation = []
        for utt in dialog["utterances"]:
            speaker_id = 0 if utt["speaker"] == "ASSISTANT" else 1
            conversation.append(
                {
                    "text": utt["text"],
                    "speaker_id": speaker_id,
                    "start": utt["index"],
                }
            )
        conversation = join_consecutive_utterances(conversation)
        if len(conversation) > 1:
            write_json(conversation, join(self.turn_level_root, filename))
            filenames.append(filename)
            total += 1
        else:
            skipped += 1

    train_filepaths, val_filepaths, test_filepaths = self._create_splits(filenames)

    print(self.NAME)
    print("Skipped: ", skipped)
    print("Total dialogs: ", total)
    print("Train: ", len(train_filepaths))
    print("Val: ", len(val_filepaths))
    print("Test: ", len(test_filepaths))
    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
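# `join_consecutive_utterances` merges adjacent utterances from the same
# speaker into a single turn. A minimal sketch, assuming the turn dicts built
# above ('text', 'speaker_id', 'start'):
def join_consecutive_utterances(conversation):
    joined = []
    for utt in conversation:
        if joined and joined[-1]["speaker_id"] == utt["speaker_id"]:
            joined[-1]["text"] += " " + utt["text"]
        else:
            joined.append(dict(utt))  # copy to avoid mutating the input
    return joined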
def prepare_pos(self):
    if not self.check_if_dir_exists(self.pos_root):
        makedirs(self.pos_root, exist_ok=True)

        # Make sure that the data we need exists
        self.prepare_turn_level()

        # Iterate over the turn-level dialogs and extract POS tags
        files = glob(join(self.turn_level_root, "*.json"))
        for turn_level_path in tqdm(files, desc=f"{self.NAME} POS"):
            turn_level_dialog = read_json(turn_level_path)
            pos, words = extract_turn_level_pos(turn_level_dialog)
            write_json(
                {"pos": pos, "words": words},
                join(self.pos_root, basename(turn_level_path)),
            )
    return self.pos_root
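# `extract_turn_level_pos` is assumed to return parallel lists of POS tags
# and words for a dialog. A sketch using NLTK's pos_tag (an assumption; the
# project may well use a different tagger such as spaCy):
from nltk import pos_tag


def extract_turn_level_pos(turn_level_dialog):
    pos, words = [], []
    for turn in turn_level_dialog:
        for word, tag in pos_tag(turn["text"].split()):
            words.append(word)
            pos.append(tag)
    return pos, words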
def _process_turn_level(self):
    """
    The super class contains higher-level functions used by different
    datasets, such as "prepare_turn_level". These prepare methods check
    whether the files exist and, if they do not, call the dataset-specific
    '_process_turn_level' which extracts the relevant data.
    """
    print(f"{self.NAME}: process_turn_level (slow)")

    # From the super class: makes sure that the data we need exists
    self.prepare_word_level()  # word-level dialogs required
    self.prepare_vad()  # processed VAD values required

    # Extract turn level
    makedirs(self.turn_level_root, exist_ok=True)

    # loop over entries in the word-level processing and transform to turns
    word_level_files = glob(join(self.word_level_root, "*.json"))
    for word_level_path in tqdm(word_level_files):
        json_name = basename(word_level_path)
        audio_path = self.get_audio_path(json_name.replace(".json", ""))
        vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))

        word_level_dialog = read_json(word_level_path)
        vad = torch.load(vad_path)  # list of (start, end) times
        duration = get_duration_sox(audio_path)
        sr = get_sample_rate_sox(audio_path)
        word_level_turns = word_level_to_turns(
            word_level_dialog,
            vad,
            duration,
            sr,
            # vad_step_time=vad_step_time,
            # vad_pad=vad_pad,
            # ipu_thresh=ipu_thresh,
        )
        write_json(word_level_turns, join(self.turn_level_root, json_name))
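# `get_duration_sox` and `get_sample_rate_sox` shell out to sox. A minimal
# sketch using the `soxi` CLI flags `-D` (duration in seconds) and `-r`
# (sample rate), assuming sox is installed and on PATH:
import subprocess


def get_duration_sox(audio_path):
    out = subprocess.check_output(["soxi", "-D", audio_path])
    return float(out.decode().strip())


def get_sample_rate_sox(audio_path):
    out = subprocess.check_output(["soxi", "-r", audio_path])
    return int(out.decode().strip())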
def prepare_chunked_tokens(self,
                           tokenized_path,
                           chunk_size,
                           overlap,
                           keep_length,
                           sep="_#"):
    assert chunk_size > 0, "chunk size must be larger than 0"
    tokenized_chunk_path = tokenized_path + f"_chunk-{chunk_size}"
    if not self.check_if_dir_exists(tokenized_chunk_path, ".json"):
        print(f"Chunk {self.NAME} -> {chunk_size}")
        makedirs(tokenized_chunk_path, exist_ok=True)

        # Copy tokenizer info from the source tokenization
        src = join(tokenized_path, "tokenizer_info")
        dst = join(tokenized_chunk_path, "tokenizer_info")
        shutil.copy(src, dst)

        tokenized_files = glob(join(tokenized_path, "*.json"))
        for json_path in tqdm(tokenized_files, desc=f"{self.NAME} Chunk"):
            tokenized_dialog = read_json(json_path)
            chunked_dialogs = chunk_tokenized_dialog(
                tokenized_dialog, chunk_size, overlap, keep_length)

            # Save the chunked files
            name = basename(json_path).replace(".json", "")
            for i, chunked_dialog in enumerate(chunked_dialogs):
                tmp_name = name
                if i > 0:
                    tmp_name += sep + str(i)
                write_json(chunked_dialog,
                           join(tokenized_chunk_path, tmp_name + ".json"))
    print("Chunk size: ", chunk_size)
    return tokenized_chunk_path
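# `chunk_tokenized_dialog` splits a (possibly very long) tokenized dialog
# into fixed-size windows. A simplified sketch: it slides a window of
# `chunk_size` tokens with `overlap` tokens shared between consecutive
# chunks, and drops a trailing chunk shorter than `keep_length` (these
# semantics are assumed, not taken from the real implementation):
def chunk_tokenized_dialog(tokenized_dialog, chunk_size, overlap, keep_length):
    assert 0 <= overlap < chunk_size, "overlap must be smaller than chunk_size"
    chunks = []
    step = chunk_size - overlap
    n = len(tokenized_dialog["input_ids"])
    for lo in range(0, n, step):
        hi = min(lo + chunk_size, n)
        if hi - lo < keep_length and lo > 0:
            break  # trailing remainder too short to keep
        # slice every parallel field (input_ids, speaker_ids, ...) the same way
        chunks.append({k: v[lo:hi] for k, v in tokenized_dialog.items()})
        if hi == n:
            break
    return chunks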
def _process_turn_level(self):
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()

    files = [
        "test_none_original.txt",
        "train_none_original.txt",
        "valid_none_original.txt",
    ]

    dialog_num = 0
    for file in files:
        dialogs = read_txt(join(self.raw_data_root, file))
        filepaths = []
        tmp_turns = []
        turn_ind = 0
        start = 0
        for d in tqdm(dialogs, desc=file):
            # each row starts with its turn index followed by a
            # tab-separated utterance pair
            n = int(d.split(" ", 1)[0])  # int(d[0]) would break for indices >= 10
            utts = d.split("\t")[:2]
            if n > turn_ind:  # conversation continues
                t1 = re.sub(r"^(\d+)\s", "", utts[0])
                t1 = clean_persona(t1)
                tmp_turns.append({
                    "text": t1,
                    "speaker_id": 0,
                    "start": start
                })
                start += 1
                t2 = clean_persona(utts[1])
                tmp_turns.append({
                    "text": t2,
                    "speaker_id": 1,
                    "start": start
                })
                start += 1
                turn_ind = n
            else:  # a new dialog begins: save the previous one
                filename = f"persona{dialog_num}.json"
                write_json(tmp_turns, join(self.turn_level_root, filename))
                filepaths.append(filename)

                # Reset -------------------------------------------------
                dialog_num += 1
                tmp_turns = []
                start = 0
                turn_ind = n

                # first utterance pair in this dialog --------------------
                t1 = re.sub(r"^(\d+)\s", "", utts[0])
                t1 = clean_persona(t1)
                tmp_turns.append({
                    "text": t1,
                    "speaker_id": 0,
                    "start": start
                })
                start += 1
                t2 = clean_persona(utts[1])
                tmp_turns.append({
                    "text": t2,
                    "speaker_id": 1,
                    "start": start
                })
                start += 1

        # flush the final dialog of the file (dialogs are otherwise only
        # saved when the next one begins)
        if len(tmp_turns) > 0:
            filename = f"persona{dialog_num}.json"
            write_json(tmp_turns, join(self.turn_level_root, filename))
            filepaths.append(filename)
            dialog_num += 1

        if "train" in file:
            write_txt(filepaths, join(self.root, "train.txt"))
        elif "valid" in file:
            write_txt(filepaths, join(self.root, "val.txt"))
        else:
            write_txt(filepaths, join(self.root, "test.txt"))
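# `clean_persona` normalizes PersonaChat text. A minimal sketch, assuming the
# same kind of normalization as the other clean_* helpers (tokenized
# apostrophes and spaced-out punctuation):
import re


def clean_persona(text):
    text = re.sub(r"\s+'\s*", "'", text)          # "don ' t" -> "don't"
    text = re.sub(r"\s+([?!.,;:])", r"\1", text)  # "hi ."    -> "hi."
    return text.strip()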
def _process_turn_level(self):
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()

    train_filepaths = []
    val_filepaths = []
    test_filepaths = []
    total, skipped = 0, 0
    t = time.time()
    for json_path in tqdm(glob(join(self.raw_data_root, "*.json")),
                          desc=self.NAME):
        data_name = basename(json_path).replace(".json", "")
        dialogs = read_json(json_path)

        if data_name == "TM1_self":
            self_train = read_txt(join(self.raw_data_root, "tm1_train.txt"))
            self_val = read_txt(join(self.raw_data_root, "tm1_dev.txt"))
            self_test = read_txt(join(self.raw_data_root, "tm1_test.txt"))
            # clean trailing commas (originally a csv file)
            self_train = [f.strip(",") for f in self_train]
            self_val = [f.strip(",") for f in self_val]
            self_test = [f.strip(",") for f in self_test]

        filenames = []
        for dialog_data in dialogs:
            filename = "-".join([
                data_name,
                dialog_data["conversation_id"],
                dialog_data["instruction_id"],
            ])
            # "_" is used when concatenating dsets
            filename = filename.replace("_", "-")
            filename += ".json"  # filename too long?

            dialog = self._extract_turn_level_dialogs(dialog_data)
            if dialog is None or len(dialog) < 2:
                skipped += 1
            else:
                dialog = join_consecutive_utterances(dialog)
                if len(dialog) > 1:
                    write_json(dialog, join(self.turn_level_root, filename))
                    total += 1

                    # TM1_self dialogs contain predefined train/dev/test
                    # splits; we use dev as val.
                    if data_name == "TM1_self":
                        if dialog_data["conversation_id"] in self_train:
                            train_filepaths.append(filename)
                        elif dialog_data["conversation_id"] in self_val:
                            val_filepaths.append(filename)
                        else:  # test
                            test_filepaths.append(filename)
                    else:
                        filenames.append(filename)

        # create splits for each data-group (restaurants, hotels, etc.);
        # see the sketch of `_create_splits` below
        if data_name != "TM1_self":
            train, val, test = self._create_splits(filenames)
            train_filepaths += train
            val_filepaths += val
            test_filepaths += test

    t = time.time() - t
    print(f"Preprocessing took {round(t, 1)} seconds.")
    print("Skipped: ", skipped)
    print("Total dialogs: ", total)
    print("Train: ", len(train_filepaths))
    print("Val: ", len(val_filepaths))
    print("Test: ", len(test_filepaths))
    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
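# `_create_splits` produces random train/val/test splits. A minimal sketch,
# assuming a module-level seeded `random` (the docstrings above mention a
# random seed set at import time) and an illustrative 80/10/10 ratio:
import random

random.seed(42)  # illustrative; the actual seed is set in the real imports


def _create_splits(self, filenames, ratios=(0.8, 0.1, 0.1)):
    filenames = list(filenames)
    random.shuffle(filenames)
    n_train = int(len(filenames) * ratios[0])
    n_val = int(len(filenames) * ratios[1])
    train = filenames[:n_train]
    val = filenames[n_train:n_train + n_val]
    test = filenames[n_train + n_val:]
    return train, val, test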
def _process_turn_level(self):
    """
    There are no pre-defined splits, so we set a random seed (see imports)
    and split all domains into train/val/test.

    Dialog data looks like this:

    string-data = '{
        "id": "af15eaa8",
        "user_id": "19b006ed",
        "bot_id": "3f60b0cb",
        "domain": "TIME_ZONE",
        "task_id": "c60e80fc",
        "turns": [
            "Hello how may I help you?",
            "Hi there, could you explain to me how time zones work? I don't really understand it",
            "Hey! I am only able to calculate times in diffferent tome zones.",
            "Oh, so you can't explain how they work, you can only find out local times?",
            "Correct",
            "Alright, could you tell me what time it is in Los Angeles now?",
            "It is currently 1:25am in Los Angeles",
            "Great! And do you know if Sacramento is in the same time zone?",
            "It is indeed",
            "Alright, thanks for the info",
            "Happy to help!"
        ]
    }'
    """
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()  # downloads if necessary

    train_filepaths = []
    val_filepaths = []
    test_filepaths = []
    total, skipped = 0, 0
    t = time.time()
    for datafile in tqdm(glob(join(self.raw_data_root, "*.txt")),
                         desc=f"{self.NAME} Turn-level"):
        filename = basename(datafile)
        if filename == "tasks.txt":
            continue

        filenames = []
        data = read_txt(datafile)
        for string_data in data:
            dict_data = json.loads(string_data)
            # dict_data.keys(): ['id', 'user_id', 'bot_id', 'domain', 'task_id', 'turns']
            conversation = []
            for i, utt in enumerate(dict_data["turns"]):
                speaker_id = 0 if i % 2 == 0 else 1
                conversation.append({
                    "text": utt,
                    "speaker_id": speaker_id,
                    "start": i,
                })
            if len(conversation) > 1:
                savename = "-".join([dict_data["domain"], dict_data["id"]])
                savename += ".json"
                write_json(conversation, join(self.turn_level_root, savename))
                filenames.append(savename)
                total += 1
            else:
                skipped += 1

        # create splits for each domain
        train, val, test = self._create_splits(filenames)
        train_filepaths += train
        val_filepaths += val
        test_filepaths += test

    t = time.time() - t
    print(self.NAME)
    print(f"Preprocessing took {round(t, 1)} seconds.")
    print("Skipped: ", skipped)
    print("Total dialogs: ", total)
    print("Train: ", len(train_filepaths))
    print("Val: ", len(val_filepaths))
    print("Test: ", len(test_filepaths))
    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
def _process_word_level(self):
    print(f"{self.NAME}: process_word_level")  # logger

    # check if word level exists
    if not self.check_if_dir_exists(self.word_level_root):
        print(f"{self.NAME}: word level data not found")
        if not (exists(self.raw_data_root) and isdir(self.raw_data_root)):
            print("raw data not found: ", self.raw_data_root)
            self.download_text()

        # Extract word level
        makedirs(self.word_level_root, exist_ok=True)

        omitted_words = 0
        changed_words = 0
        total_words = 0
        for DD in tqdm(listdir(self.raw_data_root),
                       desc=f"{self.NAME} Word level dialogs"):
            DD = join(self.raw_data_root, DD)
            if isdir(DD):
                for dialog_number in listdir(DD):
                    dialog_id = f"sw{dialog_number}"
                    A_words = read_txt(
                        join(DD, dialog_number,
                             dialog_id + "A-ms98-a-word.text"))  # speaker A
                    B_words = read_txt(
                        join(DD, dialog_number,
                             dialog_id + "B-ms98-a-word.text"))  # speaker B

                    # A_words/B_words is a list of strings, e.g.:
                    #   'sw3856A-ms98-a-0002 0.925625 1.233875 what'
                    #   '{dialog_id}{speaker}-ms98-{utt_id} {start} {end} {word}'
                    # dialog_id = sw3856
                    # utt_id    = a-0002
                    # start     = 0.925625
                    # end       = 1.233875
                    # word      = what
                    dialog_words = []
                    for speaker_id, word_list in enumerate([A_words, B_words]):
                        for word_data in word_list:
                            word_id, start, end, word = word_data.split()
                            if word == "[silence]":
                                continue
                            total_words += 1
                            if word in SWB_OMIT:
                                omitted_words += 1
                                continue
                            w = clean_swb_word(word)
                            if w != word:
                                changed_words += 1
                            start = float(start)
                            end = float(end)
                            utt_id = word_id.split("ms98-")[-1]  # utterance id
                            dialog_words.append({
                                "word": w,
                                "start": start,
                                "end": end,
                                "utt_id": utt_id,
                                "speaker_id": speaker_id,  # 0 or 1
                            })

                    # sort words by start time
                    dialog_words.sort(key=lambda item: item["start"])
                    write_json(
                        dialog_words,
                        join(self.word_level_root, dialog_id + ".json"),
                    )

        print(f"Omitted {omitted_words} "
              f"({round(100 * omitted_words / total_words, 3)}% of words)")
        print(f"Changed {changed_words} "
              f"({round(100 * changed_words / total_words, 3)}% of words)")
        print("-" * 50)
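# `clean_swb_word` strips Switchboard ms98 annotation markup from single
# words, and SWB_OMIT is a set of markers to drop entirely. A hedged sketch
# covering common markers (laughter-coded words like "[laughter-story]" and
# pronunciation-variant suffixes like "about_1"); the exact rules and the
# contents of SWB_OMIT are project-specific assumptions here:
import re

SWB_OMIT = {"[noise]", "[vocalized-noise]"}  # assumed; project-specific


def clean_swb_word(word):
    w = word
    m = re.match(r"^\[laughter-(.*)\]$", w)  # "[laughter-story]" -> "story"
    if m:
        w = m.group(1)
    w = re.sub(r"_\d+$", "", w)              # "about_1" -> "about"
    w = w.replace("{", "").replace("}", "")  # curly-brace coinage markers
    return w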
def _process_turn_level(self):
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()

    train_filepaths = []
    val_filepaths = []
    test_filepaths = []

    # csv header index:
    # 0: conv_id
    # 1: utterance_idx
    # 2: context
    # 3: prompt
    # 4: speaker_idx
    # 5: utterance
    # 6: selfeval
    # 7: tags
    omitted = 0
    n = 0
    files = ["train.csv", "valid.csv", "test.csv"]
    for filename in files:
        with open(join(self.raw_data_root, filename)) as f:
            data = f.readlines()
        filepaths = []
        dialog = []
        omit_next_dialog = False
        last_conv_id = data[1].strip().split(",")[0]
        for i in tqdm(range(1, len(data)),  # skip header row
                      desc=f"{self.NAME} Turn-level ({filename})"):
            row = data[i].strip().split(",")
            conv_id = row[0]
            utt_idx = int(row[1])
            # starts on utt_idx = 1 -> speaker_id = 0
            speaker_id = (utt_idx + 1) % 2
            utt = {
                "text": clean_empathetic(row[5]),
                "speaker_id": speaker_id,
                "start": utt_idx,
                "emotion": row[2],  # the 'context' column holds the emotion label
                "id": conv_id,
            }
            if "|" in utt["text"]:
                omit_next_dialog = True
            if last_conv_id == conv_id:
                dialog.append(utt)
            else:
                if not omit_next_dialog:
                    savename = f"emp{n}.json"
                    write_json(dialog, join(self.turn_level_root, savename))
                    filepaths.append(savename)
                    n += 1
                else:
                    omitted += 1
                omit_next_dialog = False
                dialog = [utt]
            last_conv_id = conv_id

        # flush the last dialog of the file (the loop above only saves a
        # dialog once the next one begins)
        if len(dialog) > 0:
            if not omit_next_dialog:
                savename = f"emp{n}.json"
                write_json(dialog, join(self.turn_level_root, savename))
                filepaths.append(savename)
                n += 1
            else:
                omitted += 1

        if "train" in filename:
            write_txt(filepaths, join(self.root, "train.txt"))
        elif "valid" in filename:
            write_txt(filepaths, join(self.root, "val.txt"))
        else:
            write_txt(filepaths, join(self.root, "test.txt"))

    print(f"Omitted: {omitted} dialogs")
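# `clean_empathetic` undoes the CSV-safe encoding used by the
# EmpatheticDialogues release, where commas inside utterances are stored as
# "_comma_" (which is also why the naive split(",") above is safe). Sketch:
def clean_empathetic(text):
    return text.replace("_comma_", ",").strip()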