def get_data_loaders(config, tokenizer):
    """Prepare the train and validation data loaders (multi-candidate setup).

    Loads the dataset through ``get_dataset_for_daily_dialog``, builds one
    model input per (utterance, candidate) pair with
    ``build_input_from_segments``, pads everything with ``pad_dataset`` and
    wraps the resulting tensors in ``DataLoader`` objects.

    Args:
        config: Object exposing ``dataset_path``, ``dataset_cache``,
            ``num_candidates``, ``max_history``, ``distributed``,
            ``train_batch_size`` and ``valid_batch_size``.
        tokenizer: Tokenizer handed to the dataset loader and the input
            builder; also used to resolve the padding token id.

    Returns:
        A tuple ``(train_loader, valid_loader, train_sampler, valid_sampler)``.
        Both samplers are ``None`` unless ``config.distributed`` is truthy.
    """
    personachat = get_dataset_for_daily_dialog(
        tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS
    )

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    # Inputs longer than this are rebuilt from truncated segments.
    max_input_length = 310
    # Number of leading tokens kept per history turn / candidate when truncating.
    truncated_segment_length = 10
    # Number of most recent turns kept from the history and its annotations.
    history_window = 2 * config.max_history + 1
    truncated_count = 0

    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        # Only the training split may be limited to fewer candidates.
        if config.num_candidates > 0 and dataset_name == "train":
            num_candidates = min(config.num_candidates, num_candidates)
        datasets[dataset_name]["n_candidates"] = num_candidates

        for dialog in dataset:
            topic = dialog["topic"]
            for utterance in dialog["utterances"]:
                history = utterance["history"][-history_window:]
                emotions = utterance["emotion"][-history_window:]
                actions = utterance["act"][-history_window:]

                all_candidates = utterance["candidates"]
                candidates = all_candidates[-num_candidates:]
                # Index of the first kept candidate in the unsliced lists.
                # The emotion/act lists are assumed to be parallel to
                # "candidates" (TODO confirm), so they must be read at the
                # same absolute position, not at the position in the slice.
                offset = len(all_candidates) - len(candidates)

                for j, candidate in enumerate(candidates):
                    # The true label is always the last one in the list of
                    # candidates.
                    lm_labels = j == num_candidates - 1
                    candidate_emotion = utterance["candidates_emotions"][offset + j]
                    candidate_act = utterance["candidates_acts"][offset + j]
                    instance, _ = build_input_from_segments(
                        topic, history, emotions, actions, candidate,
                        candidate_emotion, candidate_act, tokenizer, lm_labels,
                    )
                    if len(instance["input_ids"]) > max_input_length:
                        # Too long: rebuild from shortened history turns and
                        # a shortened candidate.
                        truncated_history = [
                            hist[:truncated_segment_length] for hist in history
                        ]
                        truncated_candidate = candidate[:truncated_segment_length]
                        instance, _ = build_input_from_segments(
                            topic, truncated_history, emotions, actions,
                            truncated_candidate, candidate_emotion,
                            candidate_act, tokenizer, lm_labels,
                        )
                        truncated_count += 1
                    for input_name, input_array in instance.items():
                        datasets[dataset_name][input_name].append(input_array)

                # One multiple-choice label per utterance: the index of the
                # last candidate.
                datasets[dataset_name]["mc_labels"].append(num_candidates - 1)

    logger.info("Truncated %d over-length instances", truncated_count)

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    padding_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=padding_id)
        n_candidates = datasets[dataset_name]["n_candidates"]
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                # Group the flat per-candidate rows back into
                # (utterance, candidate, ...) so every tensor shares the
                # same first dimension as mc_labels.
                tensor = tensor.view((-1, n_candidates) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset = TensorDataset(*tensor_datasets["train"])
    valid_dataset = TensorDataset(*tensor_datasets["valid"])
    if config.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)
    else:
        train_sampler = None
        valid_sampler = None
    # NOTE(review): shuffle=False means the training data is not shuffled in
    # the non-distributed case -- confirm this is intended.
    train_loader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=config.train_batch_size,
        shuffle=False,
    )
    valid_loader = DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        batch_size=config.valid_batch_size,
        shuffle=False,
    )

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
def get_data_loaders(config, tokenizer):
    """Prepare the train and validation data loaders (single-reply setup).

    Loads the dataset through ``get_dataset_for_daily_dialog`` and builds one
    model input per utterance from its history, the history emotions, the
    last candidate and that candidate's emotion. Utterances whose last
    candidate emotion equals the id of ``SPECIAL_TOKENS[4]`` are dropped.
    The inputs are padded with ``pad_dataset`` and wrapped in ``DataLoader``
    objects.

    Args:
        config: Object exposing ``dataset_path``, ``dataset_cache``,
            ``num_candidates``, ``max_history``, ``distributed``,
            ``train_batch_size`` and ``valid_batch_size``.
        tokenizer: Tokenizer handed to the dataset loader and the input
            builder; also used to resolve special-token ids.

    Returns:
        A tuple ``(train_loader, valid_loader, train_sampler, valid_sampler)``.
        Both samplers are ``None`` unless ``config.distributed`` is truthy.
    """
    personachat = get_dataset_for_daily_dialog(
        tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS
    )

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    # Inputs longer than this are rebuilt from truncated segments.
    gpu_max_length = 310
    # Number of leading tokens kept per history turn / reply when truncating.
    truncated_segment_length = 10
    # Number of most recent turns kept from the history and its emotions.
    history_window = 2 * config.max_history + 1
    # Utterances whose reply carries this emotion id are skipped. Resolved
    # once here instead of converting the whole token list per utterance.
    # Presumably this is the neutral / "no emotion" label -- TODO confirm.
    skipped_emotion_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)[4]

    for dataset_name, dataset in personachat.items():
        # Fixed candidate count; it is only recorded under "n_candidates"
        # and is not used for any reshaping in this function.
        num_candidates = 2
        if config.num_candidates > 0 and dataset_name == "train":
            num_candidates = min(config.num_candidates, num_candidates)
        datasets[dataset_name]["n_candidates"] = num_candidates

        for dialog in dataset:
            for utterance in dialog["utterances"]:
                history = utterance["history"][-history_window:]
                emotions = utterance["emotion"][-history_window:]
                # Only the last candidate (presumably the ground-truth
                # reply) and its emotion are used.
                reply = utterance["candidates"][-1]
                true_emotion = utterance["candidates_emotions"][-1]
                if true_emotion == skipped_emotion_id:
                    continue

                instance, _ = build_input_from_segments(
                    history, emotions, reply, true_emotion, tokenizer
                )
                if len(instance["input_ids"]) > gpu_max_length:
                    # Too long: rebuild from shortened history turns and a
                    # shortened reply.
                    truncated_history = [
                        hist[:truncated_segment_length] for hist in history
                    ]
                    truncated_candidate = reply[:truncated_segment_length]
                    instance, _ = build_input_from_segments(
                        truncated_history, emotions, truncated_candidate,
                        true_emotion, tokenizer,
                    )

                for input_name, input_array in instance.items():
                    datasets[dataset_name][input_name].append(input_array)

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    padding_id = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1])
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=padding_id)
        for input_name in MODEL_INPUTS:
            # No per-candidate reshape here: each row is one utterance.
            tensor_datasets[dataset_name].append(torch.tensor(dataset[input_name]))

    logger.info("Build train and validation dataloaders")
    train_dataset = TensorDataset(*tensor_datasets["train"])
    valid_dataset = TensorDataset(*tensor_datasets["valid"])
    if config.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
        valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset)
    else:
        train_sampler = None
        valid_sampler = None
    # NOTE(review): shuffle=False means the training data is not shuffled in
    # the non-distributed case -- confirm this is intended.
    train_loader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=config.train_batch_size,
        shuffle=False,
    )
    valid_loader = DataLoader(
        valid_dataset,
        sampler=valid_sampler,
        batch_size=config.valid_batch_size,
        shuffle=False,
    )

    # NOTE(review): the messages below still mention a "Candidates" axis,
    # but the tensors are not reshaped per candidate in this function.
    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler