def get_dataset(args, datasets, data_dir, tokenizer, split_name):
    """Load one or more SQuAD-style dataset files, merge them, and wrap them.

    Args:
        args: config namespace forwarded to read_and_process.
        datasets: comma-separated dataset file names located under data_dir.
        data_dir: directory containing the dataset files.
        tokenizer: tokenizer forwarded to read_and_process.
        split_name: split label; 'train' puts the QADataset in training mode.

    Returns:
        Tuple of (util.QADataset over the encoded examples, merged dataset dict).
    """
    names = datasets.split(',')
    # Cache-key suffix built from every dataset name, e.g. '_squad_nat_questions'.
    combined_name = ''.join(f'_{name}' for name in names)
    merged = None
    for name in names:
        merged = util.merge(merged, util.read_squad(f'{data_dir}/{name}'))
    encodings = read_and_process(args, tokenizer, merged, data_dir,
                                 combined_name, split_name)
    return util.QADataset(encodings, train=(split_name == 'train')), merged
def get_dataset_eda_revised(args, datasets, data_dir, tokenizer, split_name, train_fraction):
    """Load datasets through the EDA (data-augmentation) path, merge, and wrap.

    Same shape as get_dataset, but each dataset is produced by
    xuran_perform_eda.perform_eda instead of util.read_squad.

    Args:
        args: config namespace forwarded to read_and_process.
        datasets: comma-separated dataset file names located under data_dir.
        data_dir: directory containing the dataset files.
        tokenizer: tokenizer forwarded to read_and_process.
        split_name: split label; 'train' puts the QADataset in training mode.
        train_fraction: forwarded to perform_eda — presumably the fraction of
            the split to augment; TODO confirm against perform_eda's contract.

    Returns:
        Tuple of (util.QADataset over the encoded examples, merged dataset dict).
    """
    datasets = datasets.split(',')
    dataset_dict = None
    dataset_name = ''
    for dataset in datasets:
        dataset_name += f'_{dataset}'
        # Augmented read path (replaces the plain util.read_squad loader).
        dataset_dict_curr = xuran_perform_eda.perform_eda(
            f'{data_dir}/{dataset}', dataset, train_fraction)
        dataset_dict = util.merge(dataset_dict, dataset_dict_curr)
    data_encodings = read_and_process(args, tokenizer, dataset_dict, data_dir,
                                      dataset_name, split_name)
    return util.QADataset(data_encodings, train=(split_name == 'train')), dataset_dict
def get_dataset(args, dataset, data_dir, tokenizer, split_name, dataset_idx=None):
    """Load a single dataset or a list of datasets, merge, and wrap in QADataset.

    NOTE(review): this redefines the earlier `get_dataset` in this file with a
    different signature (str-vs-list `dataset`, extra `dataset_idx`), silently
    shadowing it at import time — confirm which definition callers expect and
    rename one of them.

    Args:
        args: config namespace forwarded to read_and_process.
        dataset: a single dataset file name, or a list of names, under data_dir.
        data_dir: directory containing the dataset files.
        tokenizer: tokenizer forwarded to read_and_process.
        split_name: split label; 'train' puts the QADataset in training mode.
        dataset_idx: optional index forwarded to read_and_process.

    Returns:
        Tuple of (util.QADataset over the encoded examples, merged dataset dict).
    """
    # isinstance (rather than `type(...) is list`) also accepts list subclasses.
    if isinstance(dataset, list):
        output_name = ''
        dataset_dict = None
        for dataset_name in dataset:
            output_name += f'_{dataset_name}'
            dataset_dict_curr = util.read_squad(f'{data_dir}/{dataset_name}')
            dataset_dict = util.merge(dataset_dict, dataset_dict_curr)
    else:
        output_name = f'_{dataset}'
        dataset_dict = util.read_squad(f'{data_dir}/{dataset}')
    data_encodings = read_and_process(args, tokenizer, dataset_dict, data_dir,
                                      output_name, split_name, dataset_idx)
    return util.QADataset(data_encodings, train=(split_name == 'train')), dataset_dict