import os.path as p

from datasets import DatasetDict, concatenate_datasets, load_dataset


def make_etr_dataset_v1(args):
    """Builds the ETRI dataset.

    1. Filter by document length against the 512 threshold
    2. Deduplicate contexts, keeping at most 4 questions each
    3. Sample 3,000 examples weighted by ans_start position
    """
    etr_dataset_path = p.join(args.path.train_data_dir, "etr_dataset_v1")

    if p.exists(etr_dataset_path):
        raise FileExistsError(f"{etr_dataset_path} already exists!")

    # get_etr_dataset and the filtering/sampling helpers are project-local utilities.
    etr_dataset = get_etr_dataset(args)

    # (1) Document length: 512 is the minimum document length in KLUE MRC
    etr_dataset = filtering_by_doc_len(etr_dataset, doc_len=512)

    # (2) Deduplicate contexts: at most 4 questions per context
    etr_dataset = filtering_by_dup_question(etr_dataset, dup_limit=4)

    # (3) Sample 3,000 examples weighted by answer_start position
    etr_dataset = sampling_by_ans_start_weights(etr_dataset, sample=3000)

    # (4) Save only the ETR dataset
    etr_datasets = DatasetDict({"train": etr_dataset})
    etr_datasets.save_to_disk(etr_dataset_path)

    print(f"Saved to {etr_dataset_path}!")
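# make_etr_dataset_v1 (and make_kor_dataset_v1 below) rely on project-local
# helpers whose definitions are not shown. A minimal sketch of what the three
# shared filters might look like, assuming a Hugging Face Dataset with
# "context", "question", and "answers" columns -- these bodies are illustrative
# guesses, not the project's actual implementations:
import collections

import numpy as np


def filtering_by_doc_len(dataset, doc_len=512):
    # Keep only examples whose context is at least `doc_len` characters long,
    # matching KLUE MRC's minimum document length (assumed direction).
    return dataset.filter(lambda ex: len(ex["context"]) >= doc_len)


def filtering_by_dup_question(dataset, dup_limit=4):
    # Allow at most `dup_limit` questions per distinct context.
    seen = collections.defaultdict(int)

    def keep(ex):
        seen[ex["context"]] += 1
        return seen[ex["context"]] <= dup_limit

    return dataset.filter(keep)


def sampling_by_ans_start_weights(dataset, sample=3000):
    # One plausible reading of "answer_start weighted sampling": draw `sample`
    # rows with probability proportional to the answer's start offset, so
    # answers deep in the document are not underrepresented.
    starts = np.array([a["answer_start"][0] for a in dataset["answers"]], dtype=np.float64)
    weights = (starts + 1.0) / (starts + 1.0).sum()
    rng = np.random.default_rng(42)
    idx = rng.choice(len(dataset), size=sample, replace=False, p=weights)
    return dataset.select(idx.tolist())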
def make_kor_dataset_v1(args):
    """Builds the KorQuAD V1 dataset.

    1. Filter by document length against the 512 threshold
    2. At most 4 questions per context
    3. Sample 8,000 examples weighted by ans_start position
    """
    kor_dataset_path = p.join(args.path.train_data_dir, "kor_dataset")

    if p.exists(kor_dataset_path):
        raise FileExistsError(f"{kor_dataset_path} already exists!")

    kor_dataset = load_dataset("squad_kor_v1")
    kor_dataset = concatenate_datasets(
        [kor_dataset["train"].flatten_indices(),
         kor_dataset["validation"].flatten_indices()]
    )

    # (1) Document length: 512 is the minimum document length in KLUE MRC
    kor_dataset = filtering_by_doc_len(kor_dataset, doc_len=512)

    # (2) Deduplicate contexts: at most 4 questions per context
    kor_dataset = filtering_by_dup_question(kor_dataset, dup_limit=4)

    # (3) KOR answer_start weighted sampling, using a 2x sample multiple
    kor_dataset = sampling_by_ans_start_weights(kor_dataset, sample=8000)

    # (4) Save only the KOR dataset
    kor_datasets = DatasetDict({"train": kor_dataset})
    kor_datasets.save_to_disk(kor_dataset_path)

    print(f"Saved to {kor_dataset_path}!")
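# A hedged usage sketch of the round trip; `args` stands in for the project's
# config object, and only the attributes referenced above are assumed:
from datasets import load_from_disk

make_etr_dataset_v1(args)  # writes {args.path.train_data_dir}/etr_dataset_v1
make_kor_dataset_v1(args)  # writes {args.path.train_data_dir}/kor_dataset

kor = load_from_disk(p.join(args.path.train_data_dir, "kor_dataset"))
print(kor["train"][0]["question"])  # a KorQuAD question that survived filtering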
def test_push_streaming_dataset_dict_to_hub(self):
    ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
    local_ds = DatasetDict({"train": ds})

    with tempfile.TemporaryDirectory() as tmp:
        local_ds.save_to_disk(tmp)
        local_ds = load_dataset(tmp, streaming=True)

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(
                list(local_ds["train"].features.keys()),
                list(hub_ds["train"].features.keys()),
            )
            self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)
        finally:
            self.cleanup_repo(ds_name)
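# For context: load_dataset(path, streaming=True) over a directory written by
# save_to_disk yields an IterableDatasetDict, so the test above exercises
# push_to_hub on the streaming variant. A standalone sketch of the same round
# trip, assuming the same `datasets` behavior the test relies on:
import tempfile

from datasets import Dataset, DatasetDict, load_dataset

with tempfile.TemporaryDirectory() as tmp:
    DatasetDict({"train": Dataset.from_dict({"x": [1, 2, 3]})}).save_to_disk(tmp)
    streamed = load_dataset(tmp, streaming=True)  # IterableDatasetDict
    print(next(iter(streamed["train"])))  # expected: {'x': 1}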
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict, concatenate_datasets
from transformers import AutoTokenizer


def createDataset(config):
    """Builds the dataset from the h5 file and filters out rare *individual ATU* types."""
    df = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = df.loc[df.groupby("atu")["atu"].filter(
        lambda g: len(g) >= config["datamodules"]["atu_filter_no"]).index]
    atu = atu[["text", "atu", "desc", "label"]]

    dataset = Dataset.from_pandas(atu)
    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation="longest_first",
                         padding="max_length")

    dataset = dataset. \
        shuffle(seed=config.seed). \
        map(tokenize, batched=True)

    # split by cls (stratified)
    sub_ds = {"train": [], "test": []}
    for cls in np.unique(dataset["label"]):
        cls_ds = dataset. \
            filter(lambda d: d["label"] == int(cls))
        cls_ds = cls_ds.train_test_split(test_size=config.data.test_ratio,
                                         seed=config.seed)
        sub_ds["train"].append(cls_ds["train"])
        sub_ds["test"].append(cls_ds["test"])

    dataset = DatasetDict(
        {split: concatenate_datasets(ds) for split, ds in sub_ds.items()})
    dataset.save_to_disk(config.data.cached_dir)

    return dataset
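# The per-class loop above is a manual stratified split: each label is split
# separately with the same seed and ratio, then the per-class shards are
# concatenated, preserving class proportions in both splits. Newer `datasets`
# releases also offer train_test_split(..., stratify_by_column="label") as a
# built-in alternative when "label" is a ClassLabel feature.
# A hedged usage sketch, assuming a config object with the fields read above:
from datasets import load_from_disk

dataset = createDataset(config)  # builds, splits, tokenizes, and caches
# Later runs can skip the h5 preprocessing entirely:
dataset = load_from_disk(config.data.cached_dir)
print(dataset)  # DatasetDict with stratified "train" and "test" splits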
import logging
import os
from functools import partial
from typing import Tuple

from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from transformers import PreTrainedTokenizer

logger = logging.getLogger(__name__)

# DataTrainingArguments, normalize_text, and create_preprocess_fn are
# project-local and assumed to be importable from the surrounding package.


def build_datasets(
    data_args: DataTrainingArguments,
    tokenizer: PreTrainedTokenizer,
    cache_dir=None,
    skip_train=False,
    skip_eval=False,
) -> Tuple[Dataset, Dataset]:
    if skip_eval and skip_train:
        logger.warning("Both `skip_train` and `skip_eval` are set to True")

    json_path = data_args.data_json
    data_dir = data_args.load_data_from
    add_line_breaks = data_args.add_line_breaks
    break_token = data_args.line_break_token

    train_data, eval_data = None, None
    dataset = DatasetDict()

    if add_line_breaks:
        tokenizer.add_special_tokens(dict(additional_special_tokens=[break_token]))

    if json_path is not None:
        logger.info("Preprocessing new dataset from {}".format(json_path))
        eval_split = data_args.eval_split
        save_dir = data_args.save_data_to

        dataset = load_dataset("json", data_files=[json_path], cache_dir=cache_dir)
        if eval_split < 1:
            dataset = dataset["train"].train_test_split(test_size=eval_split, shuffle=False)

        if save_dir is None:
            # Spend less time on preprocessing
            if skip_train:
                del dataset["train"]
            if skip_eval and "test" in dataset:
                del dataset["test"]

        if not data_args.skip_text_clean:
            normalize = partial(normalize_text, add_line_breaks=add_line_breaks, brk=break_token)
            dataset = dataset.map(normalize, input_columns="text")

        proc_kwargs = dict(
            batched=True,
            batch_size=data_args.tokenizer_batch_size,
            remove_columns=["text", "title"],
        )
        if "train" in dataset:
            proc_train = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.max_target_length)
            dataset["train"] = dataset["train"].map(proc_train, **proc_kwargs)
        if "test" in dataset:
            proc_eval = create_preprocess_fn(
                tokenizer, data_args.max_source_length, data_args.val_max_target_length)
            dataset["test"] = dataset["test"].map(proc_eval, **proc_kwargs)

        dataset.set_format(
            type="torch",
            columns=["input_ids", "attention_mask", "decoder_input_ids",
                     "decoder_attention_mask", "labels"],
        )

        save_dir = data_args.save_data_to
        if save_dir is not None:
            if not os.path.exists(save_dir):
                os.mkdir(save_dir)
            logger.info("Saving preprocessed dataset to {}".format(save_dir))
            dataset.save_to_disk(save_dir)
    elif data_dir is not None:
        logger.info("Loading preprocessed dataset from {}".format(data_dir))
        if skip_train:
            eval_data = load_from_disk(os.path.join(data_dir, "test"))
        elif skip_eval:
            train_data = load_from_disk(os.path.join(data_dir, "train"))
        else:
            dataset = load_from_disk(data_dir)
    else:
        raise AttributeError("You must provide either `--data_json` or `--load_data_from` argument.")

    if "train" in dataset:
        train_data = dataset["train"]
    if "test" in dataset:
        eval_data = dataset["test"]

    return train_data, eval_data
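# A hedged sketch of how build_datasets might be driven from a training script.
# DataTrainingArguments is project-local, so the constructor calls below are
# schematic: field names mirror the attributes read inside the function, and
# the paths and checkpoint are illustrative only.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")  # assumed seq2seq checkpoint

data_args = DataTrainingArguments(
    data_json="data/corpus.json",  # hypothetical path; triggers the JSON preprocessing branch
    save_data_to="data/cached",    # cache the processed DatasetDict for later runs
)
train_data, eval_data = build_datasets(data_args, tokenizer)

# A later run can point at the cache and skip preprocessing entirely:
data_args = DataTrainingArguments(data_json=None, load_data_from="data/cached")
train_data, eval_data = build_datasets(data_args, tokenizer)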