Example #1
def get_dataset(args, datasets, data_dir, tokenizer, split_name):
    # `datasets` is a comma-separated string of dataset file names; read each
    # SQuAD-format file and merge them into a single dataset dict.
    datasets = datasets.split(',')
    dataset_dict = None
    dataset_name = ''
    for dataset in datasets:
        dataset_name += f'_{dataset}'
        dataset_dict_curr = util.read_squad(f'{data_dir}/{dataset}')
        dataset_dict = util.merge(dataset_dict, dataset_dict_curr)
    # Encode the merged examples; read_and_process is defined elsewhere in the module.
    data_encodings = read_and_process(args, tokenizer, dataset_dict, data_dir, dataset_name, split_name)
    return util.QADataset(data_encodings, train=(split_name == 'train')), dataset_dict
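A minimal call sketch for the function above; the args fields, tokenizer choice, and dataset file names are illustrative assumptions, since the surrounding training script is not shown:

from argparse import Namespace
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
args = Namespace(recompute_features=False)  # hypothetical field; the real ones depend on read_and_process
# 'squad,nat_questions' names two SQuAD-format files under the (assumed) directory below
train_data, train_dict = get_dataset(args, 'squad,nat_questions',
                                     'datasets/indomain_train', tokenizer, 'train')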
Example #2
def get_dataset_eda_revised(args, datasets, data_dir, tokenizer, split_name, train_fraction):
    # Same merge loop as get_dataset, but each file is loaded through EDA
    # (easy data augmentation) rather than a plain util.read_squad call.
    datasets = datasets.split(',')
    dataset_dict = None
    dataset_name = ''
    for dataset in datasets:
        dataset_name += f'_{dataset}'
        # previously: dataset_dict_curr = util.read_squad(f'{data_dir}/{dataset}')
        dataset_dict_curr = xuran_perform_eda.perform_eda(f'{data_dir}/{dataset}', dataset, train_fraction)
        dataset_dict = util.merge(dataset_dict, dataset_dict_curr)
    data_encodings = read_and_process(args, tokenizer, dataset_dict, data_dir, dataset_name, split_name)
    return util.QADataset(data_encodings, train=(split_name == 'train')), dataset_dict
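Called the same way as Example #1 with one extra argument; the value 0.1 for train_fraction is illustrative, and its exact semantics live inside xuran_perform_eda.perform_eda, which is not shown here:

# args and tokenizer as in the sketch for Example #1
aug_data, aug_dict = get_dataset_eda_revised(args, 'squad,nat_questions',
                                             'datasets/indomain_train', tokenizer,
                                             'train', train_fraction=0.1)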
Example #3
def get_dataset(args, dataset, data_dir, tokenizer, split_name, dataset_idx=None):
    # `dataset` may be a list of names (merged into one dict) or a single name string.
    if isinstance(dataset, list):
        output_name = ''
        dataset_dict = None
        for dataset_name in dataset:
            output_name += f'_{dataset_name}'
            dataset_dict_curr = util.read_squad(f'{data_dir}/{dataset_name}')
            dataset_dict = util.merge(dataset_dict, dataset_dict_curr)
    else:
        output_name = f'_{dataset}'
        dataset_dict = util.read_squad(f'{data_dir}/{dataset}')
    # This variant threads dataset_idx through to read_and_process.
    data_encodings = read_and_process(args, tokenizer, dataset_dict, data_dir, output_name, split_name, dataset_idx)
    return util.QADataset(data_encodings, train=(split_name == 'train')), dataset_dict
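A sketch of both branches; dataset_idx is simply forwarded to read_and_process, so its meaning (e.g. tagging examples for multi-dataset sampling) is an assumption here, as are the paths:

# List input: the named files are merged into one dataset dict.
train_data, _ = get_dataset(args, ['squad', 'nat_questions'],
                            'datasets/indomain_train', tokenizer, 'train', dataset_idx=0)
# String input: a single file is read directly.
val_data, val_dict = get_dataset(args, 'squad', 'datasets/indomain_val',
                                 tokenizer, 'validation')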