Code example #1
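This variant builds multiple-choice training instances from a DailyDialog-style corpus: each utterance contributes num_candidates candidate replies (with the gold reply last), together with topic, emotion, and dialogue-act annotations, and the padded tensors are reshaped to a (batch, candidates, sequence length) layout.
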
from collections import defaultdict

import torch
from torch.utils.data import DataLoader, TensorDataset

# get_dataset_for_daily_dialog, build_input_from_segments, pad_dataset,
# SPECIAL_TOKENS, MODEL_INPUTS, and logger are defined elsewhere in the
# surrounding module.
def get_data_loaders(config, tokenizer):
    """Prepare the dataset for training and evaluation."""
    personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS)

    # Uncomment to subsample the corpus for quick debugging runs:
    # personachat["train"] = personachat["train"][:100]
    # personachat["valid"] = personachat["valid"][:10]

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    gpu_max_length = 310  # instances longer than this are truncated to fit on the GPU
    num_truncated = 0     # counts how many instances had to be truncated
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if config.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(config.num_candidates, num_candidates)
        for dialog in dataset:
            topic = dialog["topic"]
            for utterance in dialog["utterances"]:
                history = utterance["history"][-(2 * config.max_history + 1):]
                emotions = utterance["emotion"][-(2 * config.max_history + 1):]
                actions = utterance["act"][-(2 * config.max_history + 1):]
                for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                    lm_labels = bool(j == num_candidates - 1)  # the gold reply is always the last candidate
                    candidate_emotion = utterance['candidates_emotions'][j]
                    candidate_act = utterance['candidates_acts'][j]
                    instance, _ = build_input_from_segments(topic, history, emotions, actions, candidate,
                                                            candidate_emotion, candidate_act, tokenizer, lm_labels)
                    if len(instance["input_ids"]) > gpu_max_length:
                        # Truncate both the history turns and the candidate so
                        # the rebuilt instance fits within the length budget.
                        truncated_history = [hist[:10] for hist in history]
                        truncated_candidate = candidate[:10]
                        instance, _ = build_input_from_segments(topic, truncated_history, emotions, actions, truncated_candidate,
                                                                candidate_emotion, candidate_act, tokenizer, lm_labels)
                        num_truncated += 1

                    for input_name, input_array in instance.items():
                        datasets[dataset_name][input_name].append(input_array)
                datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                datasets[dataset_name]["n_candidates"] = num_candidates
    logger.info("Truncated %d over-length instances", num_truncated)
    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False)
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
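
Both examples call a pad_dataset helper that is defined elsewhere in the project and not shown on this page. As a rough orientation, here is a minimal sketch of what such a helper might look like, assuming PADDED_INPUTS names the variable-length inputs and that lm_labels is padded with an ignore index instead of the pad token:

# Hypothetical sketch only; the project's actual pad_dataset may differ.
PADDED_INPUTS = ["input_ids", "token_type_ids", "lm_labels"]  # assumed names

def pad_dataset(dataset, padding=0):
    """Pad every sequence-valued input to the longest input_ids length."""
    max_l = max(len(x) for x in dataset["input_ids"])
    for name in PADDED_INPUTS:
        # Pad lm_labels with -1 so padded positions can be ignored by the loss.
        pad_value = padding if name != "lm_labels" else -1
        dataset[name] = [x + [pad_value] * (max_l - len(x)) for x in dataset[name]]
    return dataset
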
Code example #2
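This variant prepares data for emotion-aware generation rather than candidate ranking: only the gold reply and its emotion are kept per utterance, utterances whose gold emotion matches SPECIAL_TOKENS[4] are skipped, and the tensors are left flat with no candidate dimension.
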
from collections import defaultdict

import torch
from torch.utils.data import DataLoader, TensorDataset

# As in example #1, get_dataset_for_daily_dialog, build_input_from_segments,
# pad_dataset, SPECIAL_TOKENS, MODEL_INPUTS, and logger come from the
# surrounding module.
def get_data_loaders(config, tokenizer):
    """Prepare the dataset for training and evaluation."""
    personachat = get_dataset_for_daily_dialog(tokenizer, config.dataset_path, config.dataset_cache, SPECIAL_TOKENS)

    # Uncomment to subsample the corpus for quick debugging runs:
    # personachat["train"] = personachat["train"][:100]
    # personachat["valid"] = personachat["valid"][:10]

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    gpu_max_length = 310  # instances longer than this are truncated to fit on the GPU
    for dataset_name, dataset in personachat.items():
        num_candidates = 2  # hard-coded; originally len(dataset[0]["utterances"][0]["candidates"])
        if config.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(config.num_candidates, num_candidates)
        for dialog in dataset:
            for utterance in dialog["utterances"]:
                history = utterance["history"][-(2 * config.max_history + 1):]
                emotions = utterance["emotion"][-(2 * config.max_history + 1):]
                reply = utterance["candidates"][-1]
                true_emotion = utterance['candidates_emotions'][-1]
                if true_emotion == tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)[4]:
                    # Skip utterances whose gold emotion is SPECIAL_TOKENS[4]
                    # (presumably a "no emotion" marker).
                    continue
                instance, _ = build_input_from_segments(history,
                                                        emotions,
                                                        reply,
                                                        true_emotion,
                                                        tokenizer)

                if len(instance["input_ids"]) > gpu_max_length:
                    # Truncate both the history turns and the reply so the
                    # rebuilt instance fits within the length budget.
                    truncated_history = [hist[:10] for hist in history]
                    truncated_candidate = reply[:10]
                    instance, _ = build_input_from_segments(truncated_history,
                                                            emotions,
                                                            truncated_candidate,
                                                            true_emotion,
                                                            tokenizer)

                for input_name, input_array in instance.items():
                    datasets[dataset_name][input_name].append(input_array)

                datasets[dataset_name]["n_candidates"] = num_candidates

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            # Unlike example #1, the tensors are left flat: only the gold reply
            # is kept per utterance, so there is no candidate dimension.
            # if input_name != "mc_labels":
            #     tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if config.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if config.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=config.train_batch_size, shuffle=False)
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=config.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
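
Below is a hypothetical driver loop showing how the returned loaders and samplers could be consumed. config.n_epochs and config.device are assumed names, and set_epoch is the standard way to reshuffle a DistributedSampler between epochs.

# Hypothetical usage sketch; config.n_epochs and config.device are assumed names.
train_loader, valid_loader, train_sampler, valid_sampler = get_data_loaders(config, tokenizer)

for epoch in range(config.n_epochs):
    if config.distributed:
        train_sampler.set_epoch(epoch)  # reshuffle the per-process shards each epoch
    for batch in train_loader:
        batch = tuple(t.to(config.device) for t in batch)
        # ... forward pass, loss computation, backward, optimizer step ...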