Example no. 1
def make_etr_dataset_v1(args):
    """ETRI 데이터 셋 가져오는 함수
    1. 문서 길이 512이하 Filtering
    2. 중복 Context 제거, Question 최대 4개
    3. ans_start 위치로 3000개 샘플링
    """
    etr_dataset_path = p.join(args.path.train_data_dir, "etr_dataset_v1")

    if p.exists(etr_dataset_path):
        raise FileExistsError(f"{etr_dataset_path}는 이미 존재하는 파일입니다!")

    etr_dataset = get_etr_dataset(args)

    # (1) Document length: 512 is the minimum length for KLUE MRC
    etr_dataset = filtering_by_doc_len(etr_dataset, doc_len=512)

    # (2) Remove duplicate contexts: at most 4 questions per context
    etr_dataset = filtering_by_dup_question(etr_dataset, dup_limit=4)

    # (3) Sample 3,000 examples weighted by ETR answer_start
    etr_dataset = sampling_by_ans_start_weights(etr_dataset, sample=3000)

    # (4) Save only ETR_DATASET
    etr_datasets = DatasetDict({"train": etr_dataset})
    etr_datasets.save_to_disk(etr_dataset_path)

    print(f"{etr_dataset_path}에 저장되었습니다!")
Example no. 2
    def test_push_dataset_dict_to_hub_custom_features(self):
        features = Features({
            "x": Value("int64"),
            "y": ClassLabel(names=["neg", "pos"])
        })
        ds = Dataset.from_dict({
            "x": [1, 2, 3],
            "y": [0, 0, 1]
        },
                               features=features)

        local_ds = DatasetDict({"test": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["test"].features.keys()),
                                 list(hub_ds["test"].features.keys()))
            self.assertDictEqual(local_ds["test"].features,
                                 hub_ds["test"].features)
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
Example no. 3
    def load_data(
        self,
        data: Any,
        columns: List[str] = ["input_ids", "attention_mask", "labels"]
    ) -> 'datasets.Dataset':
        file, input, target = data
        data_files = {}
        stage = self._running_stage.value
        data_files[stage] = str(file)

        # FLASH_TESTING is set in the CI to run faster.
        if flash._IS_TESTING:
            try:
                dataset_dict = DatasetDict({
                    stage:
                    load_dataset(self.filetype,
                                 data_files=data_files,
                                 split=[f'{stage}[:20]'])[0]
                })
            except Exception:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)
        else:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

        dataset_dict = dataset_dict.map(partial(self._tokenize_fn,
                                                input=input,
                                                target=target),
                                        batched=True)
        dataset_dict.set_format(columns=columns)
        return dataset_dict[stage]
Example no. 4
    def test_push_dataset_dict_to_hub_multiple_files(self):
        ds = Dataset.from_dict({
            "x": list(range(1000)),
            "y": list(range(1000))
        })

        local_ds = DatasetDict({"train": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name,
                                 token=self._token,
                                 shard_size=500 << 5)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["train"].features.keys()),
                                 list(hub_ds["train"].features.keys()))
            self.assertDictEqual(local_ds["train"].features,
                                 hub_ds["train"].features)

            # Ensure that there are two files on the repository that have the correct name
            files = sorted(
                self._api.list_repo_files(ds_name,
                                          repo_type="dataset",
                                          token=self._token))
            self.assertListEqual(files, [
                ".gitattributes", "data/train-00000-of-00002.parquet",
                "data/train-00001-of-00002.parquet"
            ])
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  token=self._token,
                                  repo_type="dataset")
Example no. 5
    def load_data(
        self,
        file: str,
        use_full: bool = True,
        columns: List[str] = ["input_ids", "attention_mask", "labels"]
    ) -> 'datasets.Dataset':
        data_files = {}
        stage = self._running_stage.value
        data_files[stage] = str(file)

        # FLASH_TESTING is set in the CI to run faster.
        if use_full and os.getenv("FLASH_TESTING", "0") == "0":
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
        else:
            # used for debugging. Avoid processing the entire dataset   # noqa E265
            try:
                dataset_dict = DatasetDict({
                    stage:
                    load_dataset(self.filetype,
                                 data_files=data_files,
                                 split=[f'{stage}[:20]'])[0]
                })
            except AssertionError:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)

        dataset_dict = dataset_dict.map(self._tokenize_fn_wrapped,
                                        batched=True)
        dataset_dict.set_format(columns=columns)
        return dataset_dict[stage]
Example no. 6
    def __init__(self, pretrained, prepared_dir, classifier_dir):
        """
        pretrained is None means disable classifier
        """
        self.pretrained = pretrained
        self.classifier_dir = classifier_dir
        self.prepared_dir = prepared_dir
        self.datasets = DatasetDict({
            'train':
            read_dataset_from_csv(prepared_dir + '/train.csv'),
            'test':
            read_dataset_from_csv(prepared_dir + '/test.csv'),
            'validation':
            read_dataset_from_csv(prepared_dir + '/validation.csv')
        })
        self.metric = load_metric("seqeval")
        self.label_list = self.datasets["train"].features["tag"].feature.names
        check_folder(self.classifier_dir)

        if pretrained:
            self.model = AutoModelForTokenClassification.from_pretrained(
                self.pretrained, num_labels=len(self.label_list))
            self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained)
            self.data_collator = DataCollatorForTokenClassification(
                self.tokenizer)
Example no. 7
    def test_push_dataset_dict_to_hub_no_token(self):
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

        local_ds = DatasetDict({"train": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
            self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

            # Ensure that there is a single file on the repository that has the correct name
            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset"))
            self.assertTrue(
                all(
                    fnmatch.fnmatch(file, expected_file)
                    for file, expected_file in zip(
                        files, [".gitattributes", "data/train-00000-of-00001-*.parquet", "dataset_infos.json"]
                    )
                )
            )
        finally:
            self.cleanup_repo(ds_name)
Example no. 8
    def test_push_dataset_dict_to_hub_multiple_files(self):
        ds = Dataset.from_dict({"x": list(range(1000)), "y": list(range(1000))})

        local_ds = DatasetDict({"train": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token, max_shard_size="16KB")
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["train"].features.keys()), list(hub_ds["train"].features.keys()))
            self.assertDictEqual(local_ds["train"].features, hub_ds["train"].features)

            # Ensure that there are two files on the repository that have the correct name
            files = sorted(self._api.list_repo_files(ds_name, repo_type="dataset", token=self._token))
            self.assertTrue(
                all(
                    fnmatch.fnmatch(file, expected_file)
                    for file, expected_file in zip(
                        files,
                        [
                            ".gitattributes",
                            "data/train-00000-of-00002-*.parquet",
                            "data/train-00001-of-00002-*.parquet",
                            "dataset_infos.json",
                        ],
                    )
                )
            )
        finally:
            self.cleanup_repo(ds_name)
Example no. 9
    def test_push_dataset_dict_to_hub_name_without_namespace(self):
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

        local_ds = DatasetDict({"train": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["train"].features.keys()),
                                 list(hub_ds["train"].features.keys()))
            self.assertDictEqual(local_ds["train"].features,
                                 hub_ds["train"].features)

            # Ensure that there is a single file on the repository that has the correct name
            files = sorted(
                self._api.list_repo_files(ds_name, repo_type="dataset"))
            self.assertListEqual(files, [
                ".gitattributes", "data/train-00000-of-00001.parquet",
                "dataset_infos.json"
            ])
        finally:
            self._api.delete_repo(ds_name.split("/")[1],
                                  organization=ds_name.split("/")[0],
                                  repo_type="dataset")
Example no. 10
def make_kor_dataset_v1(args):
    """KorQuad Dataset V1
    1. Filter to documents of length 512 or less
    2. At most 4 questions per context
    3. Sample 8,000 examples by ans_start position
    """

    kor_dataset_path = p.join(args.path.train_data_dir, "kor_dataset")

    if p.exists(kor_dataset_path):
        raise FileExistsError(f"{kor_dataset_path}는 이미 존재하는 파일입니다!")

    kor_dataset = load_dataset("squad_kor_v1")

    kor_dataset = concatenate_datasets([
        kor_dataset["train"].flatten_indices(),
        kor_dataset["validation"].flatten_indices()
    ])

    # (1) Document length: 512 is the minimum length for KLUE MRC
    kor_dataset = filtering_by_doc_len(kor_dataset, doc_len=512)

    # (2) Remove duplicate contexts: at most 4 questions per context
    kor_dataset = filtering_by_dup_question(kor_dataset, dup_limit=4)

    # (3) Weighted sampling by KOR answer_start, using a 2x multiple
    kor_dataset = sampling_by_ans_start_weights(kor_dataset, sample=8000)

    # (4) Save only KOR_DATASET
    kor_datasets = DatasetDict({"train": kor_dataset})
    kor_datasets.save_to_disk(kor_dataset_path)

    print(f"{kor_dataset_path}에 저장되었습니다!")
Example no. 11
    def check_model(self, model):
        raw_datasets = DatasetDict()
        raw_datasets["eval"] = load_dataset("superb", "ks", split="validation")
        raw_datasets = raw_datasets.cast_column(
            "audio", datasets.features.Audio(sampling_rate=16000))

        sample = raw_datasets["eval"][0]
        out = model(sample["audio"]["array"].reshape(1, 16000))

        self.assertEqual(np.argmax(out.logits), 11)
Example no. 12
    def load_data(self,
                  data: str,
                  dataset: Optional[Any] = None) -> "datasets.Dataset":
        stage = self._running_stage.value

        file_path = data

        path = Path(file_path)
        with open(path, "rb") as f:
            squad_v_2_dict = json.load(f)

        ids = []
        titles = []
        contexts = []
        questions = []
        answers = []
        for topic in squad_v_2_dict["data"]:
            title = topic["title"]
            for comprehension in topic["paragraphs"]:
                context = comprehension["context"]
                for qa in comprehension["qas"]:
                    question = qa["question"]
                    id = qa["id"]

                    _answer_starts = [
                        answer["answer_start"] for answer in qa["answers"]
                    ]
                    _answers = [answer["text"] for answer in qa["answers"]]

                    ids.append(id)
                    titles.append(title)
                    contexts.append(context)
                    questions.append(question)
                    answers.append(
                        dict(text=_answers, answer_start=_answer_starts))

        dataset_dict = DatasetDict({
            stage:
            Dataset.from_dict({
                "id": ids,
                "title": titles,
                "context": contexts,
                "question": questions,
                "answer": answers
            })
        })

        column_names = dataset_dict[stage].column_names

        dataset_dict = dataset_dict.map(self._tokenize_fn,
                                        batched=True,
                                        remove_columns=column_names)

        return dataset_dict[stage]
Example no. 13
    def test_push_dataset_dict_to_hub_datasets_with_different_features(self):
        ds_train = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})
        ds_test = Dataset.from_dict({"x": [True, False, True], "y": ["a", "b", "c"]})

        local_ds = DatasetDict({"train": ds_train, "test": ds_test})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            with self.assertRaises(ValueError):
                local_ds.push_to_hub(ds_name.split("/")[-1], token=self._token)
        except AssertionError:
            self.cleanup_repo(ds_name)
            raise
Example no. 14
    def load_data(self,
                  data: Any,
                  columns: List[str] = None) -> "datasets.Dataset":
        stage = self._running_stage.value

        dataset_dict = DatasetDict({stage: Dataset.from_dict(data)})

        column_names = dataset_dict[stage].column_names

        dataset_dict = dataset_dict.map(self._tokenize_fn,
                                        batched=True,
                                        remove_columns=column_names)

        return dataset_dict[stage]
Example no. 15
    def test_push_dataset_dict_to_hub_custom_splits(self):
        ds = Dataset.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]})

        local_ds = DatasetDict({"random": ds})

        ds_name = f"{USER}/test-{int(time.time() * 10e3)}"
        try:
            local_ds.push_to_hub(ds_name, token=self._token)
            hub_ds = load_dataset(ds_name, download_mode="force_redownload")

            self.assertDictEqual(local_ds.column_names, hub_ds.column_names)
            self.assertListEqual(list(local_ds["random"].features.keys()), list(hub_ds["random"].features.keys()))
            self.assertDictEqual(local_ds["random"].features, hub_ds["random"].features)
        finally:
            self.cleanup_repo(ds_name)
Example no. 16
def get_correct_dataset_TUWS(wrong_key=False):

    labeled, unlabeled = correct_examples()

    train_dic = Dataset.from_dict({
        'sentence':
        labeled['sentence'] + unlabeled['sentence'],
        'label':
        labeled['label'] + unlabeled['label']
    })

    if wrong_key is False:
        return DatasetDict({'train': train_dic})
    else:
        return DatasetDict({'training_Data': train_dic})
Example no. 17
def build_dataloader(location, shuffle_dataset, sampling_fraction, config, collate_fn, tokenizer, continuous_iter=True, world_size=1, num_workers=1):
    size_dicts = {128: 64*8, 256: 32*8, 512: 16*8, 768: 8*8, 1024: 8*8}
    # TODO: num workers based on dataset size, only top 16 datasets get 2 workers, next 16 get 1 worker and rest are done in main process
    single_node = world_size == 1
    try:
        train_dataset = Dataset.load_from_disk(location)
        train_dataset = TokenizerDataset(config, tokenizer, char_to_id, dict(padding="max_length", truncation=True, return_tensors="pt", max_length=config.tokenizer_length), train_dataset)
        if num_workers > 0:
            train_loader = DataLoader(train_dataset, sampler=None if single_node else DistributedSampler(train_dataset, shuffle=shuffle_dataset), batch_size=8*8, collate_fn=None, prefetch_factor=4 if num_workers > 0 else None, num_workers=(2*num_workers) if single_node else num_workers)
        else:
            train_loader = DataLoader(train_dataset, sampler=None if single_node else DistributedSampler(train_dataset, shuffle=shuffle_dataset), batch_size=8*8,
                                      collate_fn=None,
                                      num_workers=(2 * num_workers) if single_node else num_workers)
        train_loader = custom_batching_fn(train_loader, size_dicts, continuous_iter)
    except:
        train_dataset = DatasetDict.load_from_disk(location)
        train_dataset = {k: v for k, v in train_dataset.items() if len(v) >= world_size}
        train_dataset_sampling_proba = {k: len(v) ** sampling_fraction for k, v in train_dataset.items()}
        lsum = sum(train_dataset_sampling_proba.values())
        train_dataset_sampling_proba = {k: v / lsum for k, v in train_dataset_sampling_proba.items()}
        train_dataset = {k: TokenizerDataset(config, tokenizer, char_to_id, dict(padding="max_length", truncation=True, return_tensors="pt", max_length=config.tokenizer_length), v) for k, v in train_dataset.items()}
        # for v in train_dataset.values():
        #     v.training = False
        if num_workers > 0:
            train_loader = {k: DataLoader(v, sampler=None if single_node else DistributedSampler(v, shuffle=shuffle_dataset, ), batch_size=8*8, collate_fn=collate_fn, prefetch_factor=2, num_workers=(2*num_workers) if single_node else num_workers) for k, v in train_dataset.items()}
        else:
            train_loader = {
                k: DataLoader(v, sampler=None if single_node else DistributedSampler(v, shuffle=shuffle_dataset, ), batch_size=8*8, collate_fn=collate_fn,
                              num_workers=(2 * num_workers) if single_node else num_workers) for k, v in train_dataset.items()}
        train_loader = {k: custom_batching_fn(dataloader, size_dicts, continuous_iter) for k, dataloader in train_loader.items()}
        train_loader = datadict_iterator(train_loader, train_dataset_sampling_proba)
    return train_loader
Example no. 18
def create_vocabulary_from_data(datasets: DatasetDict):
    # Given training and test labels create vocabulary
    def extract_all_chars(batch):
        all_text = " ".join(batch["target_text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    vocabs = datasets.map(
        extract_all_chars,
        batched=True,
        batch_size=-1,
        keep_in_memory=True,
        remove_columns=datasets["train"].column_names,
    )

    # take union of all unique characters in each dataset
    vocab_set = functools.reduce(
        lambda vocab_1, vocab_2: set(vocab_1["vocab"][0]) | set(vocab_2["vocab"][0]),
        vocabs.values())

    vocab_dict = {v: k for k, v in enumerate(sorted(list(vocab_set)))}

    # replace white space with delimiter token
    vocab_dict["|"] = vocab_dict[" "]
    del vocab_dict[" "]

    # add unk and pad token
    vocab_dict["[UNK]"] = len(vocab_dict)
    vocab_dict["[PAD]"] = len(vocab_dict)

    return vocab_dict
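
For context, the returned dictionary is typically serialized to a vocab.json file and used to construct a CTC tokenizer. A minimal usage sketch, assuming transformers' Wav2Vec2CTCTokenizer and a DatasetDict (raw_datasets here) with the "target_text" column that the function expects:

import json

from transformers import Wav2Vec2CTCTokenizer

# Build the character vocabulary, persist it, and create a tokenizer that uses
# "|" as the word delimiter and the [UNK]/[PAD] tokens added above.
vocab_dict = create_vocabulary_from_data(raw_datasets)
with open("vocab.json", "w") as vocab_file:
    json.dump(vocab_dict, vocab_file)

tokenizer = Wav2Vec2CTCTokenizer("vocab.json",
                                 unk_token="[UNK]",
                                 pad_token="[PAD]",
                                 word_delimiter_token="|")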
Example no. 19
def concate(dataset_name, data, cache_dir):
    if dataset_name in dataset_types:
        all_datasets_downloaded = [
            load_dataset(dataset_name, sub_dataset, cache_dir=cache_dir)
            for sub_dataset in dataset_types[dataset_name]
        ]
        combined_datasets = [
            concatenate_datasets(list(sub_dataset.values()))
            for sub_dataset in all_datasets_downloaded
        ]
        data = concatenate_datasets(combined_datasets)
        return DatasetDict({"train": data})
    data = concatenate_datasets(
        list(load_dataset(dataset_name, cache_dir=cache_dir).values())
    )
    return DatasetDict({"train": data})
Example no. 20
def save_data(train_df, val_df):
    train_f = Features({
        'answers':
        Sequence(feature={
            'text': Value(dtype='string', id=None),
            'answer_start': Value(dtype='int32', id=None)
        },
                 length=-1,
                 id=None),
        'context':
        Value(dtype='string', id=None),
        'id':
        Value(dtype='string', id=None),
        'question':
        Value(dtype='string', id=None),
        'question_type':
        Value(dtype='int32', id=None)
    })

    train_datasets = DatasetDict({
        'train':
        Dataset.from_pandas(train_df, features=train_f),
        'validation':
        Dataset.from_pandas(val_df, features=train_f)
    })
    file = open("../../data/question_type.pkl", "wb")
    pickle.dump(train_datasets, file)
    file.close()
Example no. 21
    def load_data(
        self,
        data: Tuple[str, Union[str, List[str]], Union[str, List[str]]],
        dataset: Optional[Any] = None,
        columns: Union[List[str],
                       Tuple[str]] = ("input_ids", "attention_mask", "labels"),
    ) -> Union[Sequence[Mapping[str, Any]]]:
        csv_file, input, target = data

        data_files = {}

        stage = self.running_stage.value
        data_files[stage] = str(csv_file)

        # FLASH_TESTING is set in the CI to run faster.
        if flash._IS_TESTING and not torch.cuda.is_available():
            try:
                dataset_dict = DatasetDict({
                    stage:
                    load_dataset(self.filetype,
                                 data_files=data_files,
                                 split=[f'{stage}[:20]'])[0]
                })
            except Exception:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)
        else:
            dataset_dict = load_dataset(self.filetype, data_files=data_files)

        if self.training:
            labels = list(sorted(list(set(dataset_dict[stage][target]))))
            dataset.num_classes = len(labels)
            self.set_state(LabelsState(labels))

        labels = self.get_state(LabelsState)

        # convert labels to ids
        # if not self.predicting:
        if labels is not None:
            labels = labels.labels
            label_to_class_mapping = {v: k for k, v in enumerate(labels)}
            dataset_dict = dataset_dict.map(
                partial(self._transform_label, label_to_class_mapping, target))

        dataset_dict = dataset_dict.map(partial(self._tokenize_fn,
                                                input=input),
                                        batched=True)

        # Hugging Face models expect target to be named ``labels``.
        if not self.predicting and target != "labels":
            dataset_dict.rename_column_(target, "labels")

        dataset_dict.set_format("torch", columns=columns)

        return dataset_dict[stage]
Example no. 22
def get_dataset_cotrain(wrong_key=False):

    labeled, _ = correct_examples()

    unlabeled = Dataset.from_dict(
        {'sentence': ['moon what??.', 'I am people']})

    if wrong_key is False:
        return DatasetDict({
            'labeled1': labeled,
            'labeled2': labeled,
            'unlabeled': unlabeled
        })
    else:
        return DatasetDict({
            'labeled1': labeled,
            'labeled2': labeled,
            'unlabels': unlabeled
        })
Example no. 23
    def load_eval_data(self, force_reload=False, save_datasets=True) -> None:
        eval_save_dir = self.save_dir / "eval"
        try:
            if force_reload:
                raise Exception()
            self.datasets["eval"] = DatasetDict.load_from_disk(eval_save_dir)
            print("Evaluation data loaded from disk.")
        except:
            print("Regenerating evaluation data.")
            eval_df_dict = self._parse_eval_data(self.eval_dir)
            self.datasets["eval"] = DatasetDict({
                "far": Dataset.from_pandas(eval_df_dict["far"]),
                "obj": Dataset.from_pandas(eval_df_dict["obj"]),
            })
            if save_datasets:
                print(f"Saving evaluation dataset to {eval_save_dir}")
                self.datasets["eval"].save_to_disk(eval_save_dir)
Example no. 24
def split_relabel_jigsaw_severetoxic(dataset):

    dataset = dataset.rename_column("severe_toxic", "labels")
    train_val = dataset['train'].train_test_split(test_size=0.25)
    dataset = DatasetDict({
        'train': train_val['train'],
        'test': dataset['test'],
        'validation': train_val['test']
    })

    return dataset
Example no. 25
def split_relabel_jigsaw_identityhate(dataset):

    dataset = dataset.rename_column("identity_hate", "labels")
    train_val = dataset['train'].train_test_split(test_size=0.25)
    dataset = DatasetDict({
        'train': train_val['train'],
        'test': dataset['test'],
        'validation': train_val['test']
    })

    return dataset
Example no. 26
    def load_data(self,
                  data: Any,
                  columns: List[str] = None) -> "datasets.Dataset":
        if columns is None:
            columns = ["input_ids", "attention_mask", "labels"]
        if self.filetype == "json":
            file, input, target, field = data
        else:
            file, input, target = data
        data_files = {}
        stage = self._running_stage.value
        data_files[stage] = str(file)

        # FLASH_TESTING is set in the CI to run faster.
        if flash._IS_TESTING:
            try:
                if self.filetype == "json" and field is not None:
                    dataset_dict = DatasetDict({
                        stage:
                        load_dataset(self.filetype,
                                     data_files=data_files,
                                     split=[f"{stage}[:20]"],
                                     field=field)[0]
                    })
                else:
                    dataset_dict = DatasetDict({
                        stage:
                        load_dataset(self.filetype,
                                     data_files=data_files,
                                     split=[f"{stage}[:20]"])[0]
                    })
            except Exception:
                if self.filetype == "json" and field is not None:
                    dataset_dict = load_dataset(self.filetype,
                                                data_files=data_files,
                                                field=field)
                else:
                    dataset_dict = load_dataset(self.filetype,
                                                data_files=data_files)
        else:
            if self.filetype == "json" and field is not None:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files,
                                            field=field)
            else:
                dataset_dict = load_dataset(self.filetype,
                                            data_files=data_files)

        dataset_dict = dataset_dict.map(partial(self._tokenize_fn,
                                                input=input,
                                                target=target),
                                        batched=True)
        dataset_dict.set_format(columns=columns)
        return dataset_dict[stage]
Example no. 27
def createDataset(config):
    """
    build dataset from the h5 file
    also filter out rare *individual ATU*
    """
    df = pd.read_hdf(config.data.h5_file, key=config.data.h5_key)
    atu = df.loc[df.groupby("atu")["atu"].filter(
        lambda g: len(g) >= config["datamodules"]["atu_filter_no"]).index]
    atu = atu[["text", "atu", "desc", "label"]]

    dataset = Dataset.from_pandas(atu)
    tokenizer = AutoTokenizer.from_pretrained(config["module"]["arch"])

    def tokenize(instance):
        return tokenizer(instance["text"],
                         max_length=config["module"]["seq_len"],
                         truncation="longest_first",
                         padding="max_length")

    dataset = dataset. \
        shuffle(seed=config.seed). \
        map(tokenize, batched=True)

    # split by cls (stratified)
    sub_ds = {"train": [], "test": []}
    for cls in np.unique(dataset["label"]):
        cls_ds = dataset. \
            filter(lambda d: d['label'] == int(cls))
        cls_ds = cls_ds.train_test_split(test_size=config.data.test_ratio,
                                         seed=config.seed)
        sub_ds["train"].append(cls_ds["train"])
        sub_ds["test"].append(cls_ds["test"])

    dataset = DatasetDict(
        {split: concatenate_datasets(ds)
         for split, ds in sub_ds.items()})
    dataset.save_to_disk(config.data.cached_dir)
    return dataset
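
An alternative to the per-class filter/split/concatenate loop above: newer datasets releases (2.3+) can stratify a single train_test_split call once the label column is a ClassLabel. The sketch below is under that assumption (and assumes integer labels 0..K-1); it is not how createDataset itself does it.

from datasets import ClassLabel

def stratified_split(dataset, test_ratio, seed):
    # Cast "label" to ClassLabel so train_test_split can stratify directly.
    num_classes = len(set(dataset["label"]))
    dataset = dataset.cast_column("label", ClassLabel(num_classes=num_classes))
    return dataset.train_test_split(test_size=test_ratio,
                                    seed=seed,
                                    stratify_by_column="label")

The result is a DatasetDict with "train" and "test" splits, matching what the loop above assembles by hand.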
Example no. 28
def run_sparse_retrieval(datasets, training_args):
    #### retrieval process ####

    retriever = SparseRetrieval(tokenize_fn=tokenize,
                                data_path="./data",
                                context_path="wikipedia_documents.json"
                                # context_path="all_wikipedia_documents.json"
                                )
    # sparse embedding retrieval
    # retriever.get_sparse_embedding()
    #df = retriever.retrieve(datasets['validation'])

    # bm25 retrieval
    # retriever.get_embedding_BM25()
    # df = retriever.retrieve_BM25(query_or_dataset=datasets['validation'], topk=10)

    # elastic search retrieval
    # retriever.get_elastic_search()
    df = retriever.retrieve_ES(query_or_dataset=datasets['validation'],
                               topk=10)

    # faiss retrieval
    # df = retriever.retrieve_faiss(dataset['validation'])

    if training_args.do_predict:  # test data has no answers, so the dataset only contains id, question, and context.
        f = Features({
            'context': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    elif training_args.do_eval:  # train data has answers, so the dataset contains id, question, context, and answers.
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets
Example no. 29
    def load_data(self,
                  filepath: str,
                  dataset: AutoDataset,
                  columns: Union[List[str],
                                 Tuple[str]] = ("input_ids", "attention_mask",
                                                "labels"),
                  use_full: bool = True):
        data_files = {}

        stage = dataset.running_stage.value
        data_files[stage] = str(filepath)

        # FLASH_TESTING is set in the CI to run faster.
        if use_full and os.getenv("FLASH_TESTING", "0") == "0":
            dataset_dict = load_dataset(self.filetype, data_files=data_files)
        else:
            # used for debugging. Avoid processing the entire dataset   # noqa E265
            dataset_dict = DatasetDict({
                stage:
                load_dataset(self.filetype,
                             data_files=data_files,
                             split=[f'{stage}[:20]'])[0]
            })

        dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

        # convert labels to ids
        if not self.predicting:
            dataset_dict = dataset_dict.map(self._transform_label)

        dataset_dict = dataset_dict.map(self._tokenize_fn, batched=True)

        # Hugging Face models expect target to be named ``labels``.
        if not self.predicting and self.target != "labels":
            dataset_dict.rename_column_(self.target, "labels")

        dataset_dict.set_format("torch", columns=columns)

        if not self.predicting:
            dataset.num_classes = len(self.label_to_class_mapping)

        return dataset_dict[stage]
Example no. 30
def run_sparse_retrieval(datasets, training_args, inf_args):
    #### retrieval process ####
    if inf_args.retrieval is None:
        retriever = SparseRetrieval_BM25PLUS(
            tokenize_fn=tokenize,
            data_path="./data",
            context_path="wikipedia_documents.json")
    elif inf_args.retrieval.lower() == "sparse":
        retriever = SparseRetrieval(tokenize_fn=tokenize,
                                    data_path="./data",
                                    context_path="wikipedia_documents.json")
    # elif inf_args.retrieval.lower() == "bm25" or inf_args.retrieval.lower() == "bm25":
    #     retriever = SparseRetrieval_BM25(tokenize_fn=tokenize,
    #                                          data_path="./data",
    #                                          context_path="wikipedia_documents.json")

    retriever.get_sparse_embedding()
    df = retriever.retrieve(datasets['validation'], inf_args.k)

    # faiss retrieval

    # test data has no answers, so the dataset only contains id, question, and context.
    if training_args.do_predict:
        f = Features({
            'contexts': Value(dtype='string', id=None),
            'id': Value(dtype='string', id=None),
            'question': Value(dtype='string', id=None)
        })

    # train data has answers, so the dataset contains id, question, context, and answers.
    elif training_args.do_eval:
        f = Features({
            'answers':
            Sequence(feature={
                'text': Value(dtype='string', id=None),
                'answer_start': Value(dtype='int32', id=None)
            },
                     length=-1,
                     id=None),
            'context':
            Value(dtype='string', id=None),
            'id':
            Value(dtype='string', id=None),
            'question':
            Value(dtype='string', id=None)
        })

    datasets = DatasetDict({'validation': Dataset.from_pandas(df, features=f)})
    return datasets