def _split_generators(self, dl_manager):
     _URL = "https://jingshensn2.github.io/eli5c/datasets/"
     downloaded_files = dl_manager.download_and_extract({
         "train":
         _URL + "eli5-category-train.json.gz",
         "val1":
         _URL + "eli5-category-validation-1.json.gz",
         "val2":
         _URL + "eli5-category-validation-2.json.gz",
         "test":
         _URL + "eli5-category-test.json.gz",
     })
     return [
         datasets.SplitGenerator(
             name=datasets.Split.TRAIN,
             gen_kwargs={"filepath": downloaded_files["train"]},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("validation1"),
             gen_kwargs={"filepath": downloaded_files["val1"]},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("validation2"),
             gen_kwargs={"filepath": downloaded_files["val2"]},
         ),
         datasets.SplitGenerator(
             name=datasets.Split.TEST,
             gen_kwargs={"filepath": downloaded_files["test"]},
         ),
     ]
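The two extra validation sets are exposed under the custom names "validation1" and "validation2", and those exact strings are what the user passes at load time. A minimal usage sketch, assuming this loader script is registered under the (illustrative) name "eli5_category":

from datasets import load_dataset

# Custom split names created with datasets.Split("...") are requested verbatim.
val1 = load_dataset("eli5_category", split="validation1")
val2 = load_dataset("eli5_category", split="validation2")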
Example No. 2
    def _split_generators(self, dl_manager):
        all_data_folder = dl_manager.download_and_extract(_XGLUE_ALL_DATA)
        data_folder = os.path.join(all_data_folder, "xglue_full_dataset", self.config.data_dir)
        name = self.config.name

        languages = _LANGUAGES[name]
        return (
            [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"data_file": os.path.join(data_folder, _PATHS[name]["train"]), "split": "train"},
                ),
            ]
            + [
                datasets.SplitGenerator(
                    name=datasets.Split(f"validation.{lang}"),
                    gen_kwargs={
                        "data_file": os.path.join(data_folder, _PATHS[name]["dev"].format(lang)),
                        "split": "dev",
                    },
                )
                for lang in languages
            ]
            + [
                datasets.SplitGenerator(
                    name=datasets.Split(f"test.{lang}"),
                    gen_kwargs={
                        "data_file": os.path.join(data_folder, _PATHS[name]["test"].format(lang)),
                        "split": "test",
                    },
                )
                for lang in languages
            ]
        )
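Since the validation and test sets are generated once per language, each split name carries a ".{lang}" suffix. A hedged usage sketch (the "xglue" path, the "ner" config and the "de" language code are illustrative assumptions):

from datasets import load_dataset

# Per-language splits are addressed as "validation.<lang>" and "test.<lang>".
dev_de = load_dataset("xglue", "ner", split="validation.de")
test_de = load_dataset("xglue", "ner", split="test.de")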
Example No. 3
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     if self.config.name != "task4_reddit":
         my_urls = ZIP_URL  # Cannot download just one single type as it is a compressed file.
     else:
         my_urls = REDDIT_URL
     data_dir = dl_manager.download_and_extract(my_urls)
     splits = [
         datasets.SplitGenerator(
             name=datasets.Split.TRAIN,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "filepath":
                 os.path.join(data_dir, paths[self.config.name]["train"]),
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split.TEST,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "filepath":
                 os.path.join(data_dir, paths[self.config.name]["test"]),
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split.VALIDATION,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "filepath":
                 os.path.join(data_dir, paths[self.config.name]["dev"]),
             },
         ),
     ]
     if self.config.name == "task4_reddit":
         splits += [
             datasets.SplitGenerator(
                 name=datasets.Split("cand_valid"),
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath":
                     os.path.join(data_dir,
                                  paths[self.config.name]["cand_valid"]),
                 },
             ),
             datasets.SplitGenerator(
                 name=datasets.Split("cand_test"),
                 # These kwargs will be passed to _generate_examples
                 gen_kwargs={
                     "filepath":
                     os.path.join(data_dir,
                                  paths[self.config.name]["cand_test"]),
                 },
             ),
         ]
     return splits
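The two candidate splits are appended only when the "task4_reddit" configuration is selected, so they can only be requested under that config. A small sketch (the local script path is a placeholder):

from datasets import load_dataset

# "cand_valid" / "cand_test" are only registered for the "task4_reddit" config.
cand_valid = load_dataset("./this_dataset.py", "task4_reddit", split="cand_valid")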
Example No. 4
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     my_urls = _URL
     data_dir = dl_manager.download_and_extract(my_urls)
     data_path = os.path.join(data_dir, "wikipedia-biography-dataset")
     return [
         datasets.SplitGenerator(
             name=datasets.Split("train"),
             gen_kwargs={
                 "id_file":
                 os.path.join(data_path, "train", "train.id"),
                 "infobox_file":
                 os.path.join(data_path, "train", "train.box"),
                 "nb_lines_file":
                 os.path.join(data_path, "train", "train.nb"),
                 "sentences_file":
                 os.path.join(data_path, "train", "train.sent"),
                 "article_title_file":
                 os.path.join(data_path, "train", "train.title"),
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("test"),
             gen_kwargs={
                 "id_file":
                 os.path.join(data_path, "test", "test.id"),
                 "infobox_file":
                 os.path.join(data_path, "test", "test.box"),
                 "nb_lines_file":
                 os.path.join(data_path, "test", "test.nb"),
                 "sentences_file":
                 os.path.join(data_path, "test", "test.sent"),
                 "article_title_file":
                 os.path.join(data_path, "test", "test.title"),
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("val"),
             gen_kwargs={
                 "id_file":
                 os.path.join(data_path, "valid", "valid.id"),
                 "infobox_file":
                 os.path.join(data_path, "valid", "valid.box"),
                 "nb_lines_file":
                 os.path.join(data_path, "valid", "valid.nb"),
                 "sentences_file":
                 os.path.join(data_path, "valid", "valid.sent"),
                 "article_title_file":
                 os.path.join(data_path, "valid", "valid.title"),
             },
         ),
     ]
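Note that the validation data is registered under the non-canonical name "val" rather than datasets.Split.VALIDATION, so split="validation" would not resolve for this loader. A usage sketch (the "wiki_bio" name is an assumption based on the paths above):

from datasets import load_dataset

valid = load_dataset("wiki_bio", split="val")  # "val", not "validation", for this loader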
Example No. 5
 def _split_generators(self, dl_manager):
     qa_data_file = pjoin(
         self._cache_dir_root, self._relative_data_dir(with_version=False), "reddit_downloaded_qa_lists.json"
     )
     if isfile(qa_data_file):
         logger.info("loading pre-computed QA list")
         self.filtered_reddit = json.load(open(qa_data_file))
     else:
         self.filtered_reddit = _download_and_filter_reddit(
             dl_manager, start_year=2011, start_month=7, end_year=2019, end_month=7
         )
         logger.info("saving pre-computed QA list")
         json.dump(self.filtered_reddit, open(qa_data_file, "w"))
     # download data splits from AWS
     fpath_splits = dl_manager.download(self._DATA_SPLIT_URL)
     self.data_split = json.load(open(fpath_splits))
     return [
         datasets.SplitGenerator(
             name=datasets.Split("train_eli5"),
             gen_kwargs={"split": "train", "subreddit_name": "explainlikeimfive"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("validation_eli5"),
             gen_kwargs={"split": "validation", "subreddit_name": "explainlikeimfive"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("test_eli5"),
             gen_kwargs={"split": "test", "subreddit_name": "explainlikeimfive"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("train_asks"),
             gen_kwargs={"split": "train", "subreddit_name": "askscience"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("validation_asks"),
             gen_kwargs={"split": "validation", "subreddit_name": "askscience"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("test_asks"),
             gen_kwargs={"split": "test", "subreddit_name": "askscience"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("train_askh"),
             gen_kwargs={"split": "train", "subreddit_name": "AskHistorians"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("validation_askh"),
             gen_kwargs={"split": "validation", "subreddit_name": "AskHistorians"},
         ),
         datasets.SplitGenerator(
             name=datasets.Split("test_askh"),
             gen_kwargs={"split": "test", "subreddit_name": "AskHistorians"},
         ),
     ]
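Each subreddit gets its own train/validation/test triple, so there is no plain "train" split here. A hedged sketch of selecting two of them (assuming the loader is the "eli5" script):

from datasets import load_dataset

asks_train = load_dataset("eli5", split="train_asks")       # askscience questions
eli5_valid = load_dataset("eli5", split="validation_eli5")  # explainlikeimfive questions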
Example No. 6
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        dl_dir = dl_manager.download_and_extract(self.config.data_url)

        splits_gen = []

        for split_id, split_filename in self.config.splits.items():
            if self.config.gameplay_scenario == "original":
                if "train" in split_id:
                    split_name = datasets.Split.TRAIN
                elif "valid" in split_id:
                    split_name = datasets.Split.VALIDATION
                elif "test" in split_id:
                    split_name = datasets.Split.TEST
            else:
                split_name = datasets.Split(split_id)

            full_split_name = "-".join(["compguesswhat", self.config.gameplay_scenario])
            splits_gen.append(
                datasets.SplitGenerator(
                    name=split_name,
                    gen_kwargs={
                        "filepath": os.path.join(
                            dl_dir,
                            full_split_name,
                            self.VERSION.version_str,
                            split_filename,
                        )
                    },
                )
            )

        return splits_gen
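Because the split names depend on gameplay_scenario (canonical names for "original", the raw split ids otherwise), it can help to inspect what a given configuration actually exposes before loading. A sketch, assuming the script is published as "compguesswhat" and that the config name below exists:

from datasets import get_dataset_split_names

# Lists whatever _split_generators produced for the chosen configuration.
print(get_dataset_split_names("compguesswhat", "compguesswhat-zero_shot"))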
Example No. 7
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        train_path = dl_manager.download_and_extract(_TRAIN_DOWNLOAD_URL)
        valid_path = dl_manager.download_and_extract(_VALID_DOWNLOAD_URL)
        test_lay_path = dl_manager.download_and_extract(_TEST_LAY_DOWNLOAD_URL)
        test_expert_path = dl_manager.download_and_extract(
            _TEST_EXPERT_DOWNLOAD_URL)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN,
                                    gen_kwargs={"filepath": train_path}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION,
                                    gen_kwargs={"filepath": valid_path}),
            datasets.SplitGenerator(name=datasets.Split("test_lay"),
                                    gen_kwargs={"filepath": test_lay_path}),
            datasets.SplitGenerator(name=datasets.Split("test_expert"),
                                    gen_kwargs={"filepath": test_expert_path}),
        ]
Example No. 8
    def _split_generators(self, dl_manager):
        dl_dir = dl_manager.download_and_extract(self.config.data_url)
        data_dir = os.path.join(dl_dir, self.config.data_dir)

        if self.config.name in {"chid", "c3"}:
            test_file = "test1.1.json"
        elif self.config.name == "diagnostics":
            test_file = "diagnostics_test.json"
        else:
            test_file = "test.json"

        test_split = datasets.SplitGenerator(
            name=datasets.Split.TEST,
            gen_kwargs={
                "data_file": os.path.join(data_dir, test_file),
                "split": "test",
            },
        )

        split_list = [test_split]

        if self.config.name != "diagnostics":
            train_split = datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_file": os.path.join(
                        data_dir or "", "train.json" if self.config.name != "c3" else "d-train.json"
                    ),
                    "split": "train",
                },
            )
            val_split = datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_file": os.path.join(
                        data_dir or "", "dev.json" if self.config.name != "c3" else "d-dev.json"
                    ),
                    "split": "dev",
                },
            )
            split_list += [train_split, val_split]

        if self.config.name == "cmrc2018":
            split_list.append(
                datasets.SplitGenerator(
                    name=datasets.Split("trial"),
                    gen_kwargs={
                        "data_file": os.path.join(data_dir or "", "trial.json"),
                        "split": "trial",
                    },
                )
            )

        return split_list
Example No. 9
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        downloaded_files = dl_manager.download_and_extract(_URLS)

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": downloaded_files["train"]}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": downloaded_files["dev"]}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": downloaded_files["test"]}),
            datasets.SplitGenerator(
                name=datasets.Split("test_wikipedia"), gen_kwargs={"filepath": downloaded_files["test_wikipedia"]}
            ),
        ]
Example No. 10
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        # This method is tasked with downloading/extracting the data and defining the splits depending on the configuration
        # If several configurations are possible (listed in BUILDER_CONFIGS), the configuration selected by the user is in self.config.name

        data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))

        if not os.path.exists(data_dir):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('jigsaw_unintended_bias', data_dir=...)`. Manual download instructions: {}"
                .format(data_dir, self.manual_download_instructions))

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "path": os.path.join(data_dir, "train.csv"),
                    "split": "train"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("test_private_leaderboard"),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "path": os.path.join(data_dir,
                                         "test_private_expanded.csv"),
                    "split": "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("test_public_leaderboard"),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "path": os.path.join(data_dir, "test_public_expanded.csv"),
                    "split": "test"
                },
            ),
        ]
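This loader relies on a manual download: dl_manager.manual_dir is filled from the data_dir argument, exactly as the error message spells out. A usage sketch (the local path is a placeholder):

from datasets import load_dataset

# data_dir must already contain train.csv, test_private_expanded.csv and test_public_expanded.csv.
ds = load_dataset("jigsaw_unintended_bias", data_dir="/path/to/downloaded/csvs")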
Example No. 11
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     data_dir = dl_manager.download_and_extract(_URL)
     return [
         datasets.SplitGenerator(
             name=datasets.Split("auxiliary_train"),
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "datadir": os.path.join(data_dir, "data",
                                         "auxiliary_train"),
                 "split": "auxiliary_train",
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split.TEST,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "datadir": os.path.join(data_dir, "data", "test"),
                 "split": "test"
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split.VALIDATION,
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "datadir": os.path.join(data_dir, "data", "val"),
                 "split": "val",
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("dev"),
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "datadir": os.path.join(data_dir, "data", "dev"),
                 "split": "dev",
             },
         ),
     ]
Example No. 12
 def _split_generators(self, dl_manager):
     archive = dl_manager.download(_DOWNLOAD_URL)
     return [
         datasets.SplitGenerator(
             name=datasets.Split.TRAIN, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train"}
         ),
         datasets.SplitGenerator(
             name=datasets.Split.TEST, gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "test"}
         ),
         datasets.SplitGenerator(
             name=datasets.Split("unsupervised"),
             gen_kwargs={"files": dl_manager.iter_archive(archive), "split": "train", "labeled": False},
         ),
     ]
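With dl_manager.iter_archive the archive is streamed instead of extracted, so _generate_examples receives an iterator of (path_inside_archive, file_object) pairs. A rough counterpart sketch for the generator above (the directory layout and field names are assumptions, not taken from this snippet):

import os

def _generate_examples(self, files, split, labeled=True):
    """Stream the tar members and keep only files belonging to the requested split."""
    for path, f in files:
        if f"/{split}/" not in path or not path.endswith(".txt"):
            continue
        # For labeled data, the parent folder name (e.g. pos/neg) is assumed to be the label.
        label = os.path.basename(os.path.dirname(path)) if labeled else None
        yield path, {"text": f.read().decode("utf-8"), "label": label}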
Example No. 13
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""
        data_dir = dl_manager.download_and_extract(_URL)
        ROOT = "semeval-2020-task-7-dataset"

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name,
                                 "train.csv"),
                    "split":
                    "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name, "test.csv"),
                    "split":
                    "test"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name, "dev.csv"),
                    "split":
                    "dev",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("funlines"),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath":
                    os.path.join(data_dir, ROOT, self.config.name,
                                 "train_funlines.csv"),
                    "split":
                    "funlines",
                },
            ),
        ]
Example No. 14
 def _split_generators(self, mgr):
     all_data_folder = mgr.download_and_extract(_XGLUE_ALL_DATA)
     data_folder = os.path.join(all_data_folder, "xglue_full_dataset",
                                self.config.data_dir)
     name = self.config.name
     languages = _LANGS[name]
     return ([
         ds.SplitGenerator(
             name=ds.Split.TRAIN,
              gen_kwargs={
                 "data_file": os.path.join(data_folder,
                                           _PATHS[name]["train"]),
                 "split": "train",
             },
         ),
     ] + [
         ds.SplitGenerator(
             name=ds.Split(f"validation.{c}"),
              gen_kwargs={
                 "data_file":
                 os.path.join(data_folder, _PATHS[name]["dev"].format(c)),
                 "split":
                 "dev",
             },
         ) for c in languages
     ] + [
         ds.SplitGenerator(
             name=ds.Split(f"test.{x}"),
              gen_kwargs={
                 "data_file":
                 os.path.join(data_folder, _PATHS[name]["test"].format(x)),
                 "split":
                 "test",
             },
         ) for x in languages
     ])
Example No. 15
    def _split_generators(self, dl_manager):
        archive = dl_manager.download(_XGLUE_ALL_DATA)
        data_folder = f"xglue_full_dataset/{self.config.data_dir}"
        name = self.config.name

        languages = _LANGUAGES[name]
        return ([
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "archive": dl_manager.iter_archive(archive),
                    "data_path": f"{data_folder}/{_PATHS[name]['train']}",
                    "split": "train",
                },
            ),
        ] + [
            datasets.SplitGenerator(
                name=datasets.Split(f"validation.{lang}"),
                gen_kwargs={
                    "archive": dl_manager.iter_archive(archive),
                    "data_path":
                    f"{data_folder}/{_PATHS[name]['dev'].format(lang)}",
                    "split": "dev",
                },
            ) for lang in languages
        ] + [
            datasets.SplitGenerator(
                name=datasets.Split(f"test.{lang}"),
                gen_kwargs={
                    "archive": dl_manager.iter_archive(archive),
                    "data_path":
                    f"{data_folder}/{_PATHS[name]['test'].format(lang)}",
                    "split": "test",
                },
            ) for lang in languages
        ])
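Here every split re-iterates the same archive and names the single member it needs, so the generator is expected to skip entries until it reaches data_path. A rough sketch of that pattern (the per-line parsing is an assumption):

def _generate_examples(self, archive, data_path, split):
    for path, f in archive:
        if path != data_path:
            continue
        for idx, line in enumerate(f):
            yield idx, {"text": line.decode("utf-8").rstrip("\n"), "split": split}
        break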
Example No. 16
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     data_dir = dl_manager.download_and_extract(_URLs)
     return [
         datasets.SplitGenerator(
             name=datasets.Split(key),
             # These kwargs will be passed to _generate_examples
             gen_kwargs={
                 "filepath": data_dir[key],
                 "pid2name": data_dir["pid2name"],
                 "return_names": key
                 in ["train_wiki", "val_wiki", "val_nyt"],
             },
         ) for key in data_dir.keys() if key != "pid2name"
     ]
Example No. 17
 def _split_generators(self, dl_manager):
     arch_path = dl_manager.download_and_extract(_DOWNLOAD_URL)
     data_dir = os.path.join(arch_path, "aclImdb")
     return [
         datasets.SplitGenerator(
             name=datasets.Split.TRAIN, gen_kwargs={"directory": os.path.join(data_dir, "train")}
         ),
         datasets.SplitGenerator(
             name=datasets.Split.TEST, gen_kwargs={"directory": os.path.join(data_dir, "test")}
         ),
         datasets.SplitGenerator(
             name=datasets.Split("unsupervised"),
             gen_kwargs={"directory": os.path.join(data_dir, "train"), "labeled": False},
         ),
     ]
Example No. 18
    def _split_generators(self, dl_manager):
        arch_path = dl_manager.download_and_extract(self.config.data_url)

        if "relations" in self.config.name:
            train_file = "train.csv"
            test_file = "test.csv"

            generators = []

            # for k in [1, 2, 3, 4]:
            for aspect in self.config.aspects:
                for k in ["sample"] + [1, 2, 3, 4]:
                    folds_path = os.path.join(arch_path, 'folds', aspect,
                                              str(k))
                    generators += [
                        datasets.SplitGenerator(
                            name=get_train_split(aspect, k),
                            gen_kwargs={
                                'filepath':
                                os.path.join(folds_path, train_file)
                            }),
                        datasets.SplitGenerator(name=get_test_split(aspect, k),
                                                gen_kwargs={
                                                    'filepath':
                                                    os.path.join(
                                                        folds_path, test_file)
                                                })
                    ]
            return generators

        elif "docs" in self.config.name:
            # docs
            docs_file = os.path.join(arch_path, "docs.jsonl")

            return [
                datasets.SplitGenerator(name=datasets.Split('docs'),
                                        gen_kwargs={"filepath": docs_file}),
            ]
        else:
            raise ValueError()
Example No. 19
    def _split_generators(self, dl_manager):
        urls = _URLS
        data_dir = dl_manager.download_and_extract(urls)

        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["train"],
                    "split": "train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["dev"],
                    "split": "dev",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["test_unlabeled"],
                    "split": "test_unlabeled"
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("test_ood"),
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": data_dir["ood_unlabeled"],
                    "split": "ood_unlabeled"
                },
            ),
        ]
Example No. 20
def get_test_split(k):
    return datasets.Split(f'fold_{k}_test')
Example No. 21
def get_train_split(k):
    return datasets.Split(f'fold_{k}_train')
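The two helpers above just build the split-name strings that the corresponding loader registers, so they can also be passed directly as the split argument. A sketch (the local script path is a placeholder):

from datasets import load_dataset

fold2_train = load_dataset("./this_dataset.py", split=get_train_split(2))  # "fold_2_train"
fold2_test = load_dataset("./this_dataset.py", split=get_test_split(2))    # "fold_2_test"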
Example No. 22
    def _split_generators(self, dl_manager):
        """Returns SplitGenerators."""

        dl_dir = dl_manager.download_and_extract(_URL)

        return [
            datasets.SplitGenerator(
                name=datasets.Split("exercise_contest_train"),
                gen_kwargs={
                    "filepath":
                    os.path.join(
                        dl_dir,
                        "final_all_data/exercise_contest/data_train.json"),
                    "split":
                    "exercise_contest_train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("exercise_contest_valid"),
                gen_kwargs={
                    "filepath":
                    os.path.join(
                        dl_dir,
                        "final_all_data/exercise_contest/data_valid.json"),
                    "split":
                    "exercise_contest_valid",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("exercise_contest_test"),
                gen_kwargs={
                    "filepath":
                    os.path.join(
                        dl_dir,
                        "final_all_data/exercise_contest/data_test.json"),
                    "split":
                    "exercise_contest_test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("first_stage_train"),
                gen_kwargs={
                    "filepath":
                    os.path.join(dl_dir,
                                 "final_all_data/first_stage/train.json"),
                    "split":
                    "first_stage_train",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("first_stage_test"),
                gen_kwargs={
                    "filepath":
                    os.path.join(dl_dir,
                                 "final_all_data/first_stage/test.json"),
                    "split":
                    "first_stage_test",
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split("final_test"),
                gen_kwargs={
                    "filepath":
                    os.path.join(dl_dir, "final_all_data/final_test.json"),
                    "split":
                    "final_test"
                },
            ),
        ]
Example No. 23
def main():
    # Auto-environment
    env = get_env()

    parser = HfArgumentParser(
        (ModelArguments, TrainingArguments, ExperimentArguments))
    model_args, training_args, experiment_args = parser.parse_args_into_dataclasses()

    # Adjust output with folds and model name
    #TODO disabled
    # training_args.output_dir = os.path.join(training_args.output_dir, str(experiment_args.cv_fold), model_args.get_model_name())

    # Model path from env
    if not os.path.exists(model_args.model_name_or_path) and os.path.exists(
            os.path.join(env['bert_dir'], model_args.model_name_or_path)):
        model_args.model_name_or_path = os.path.join(
            env['bert_dir'], model_args.model_name_or_path)

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Dataset args
    label_classes = get_label_classes_from_hf_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset))
    num_labels = len(label_classes)

    if num_labels > 1 and experiment_args.binary_classification:
        # In binary classification we have only a single label (with y in [0, 1])
        num_labels = 1
        logger.warning(f'Forcing label classes to binary: {label_classes}')

    columns = ['input_ids', 'attention_mask', 'token_type_ids',
               'labels']  # Input to transformers.forward

    # Build dataset for splits
    train_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_train_split(experiment_args.aspect, experiment_args.cv_fold))
    test_ds = load_dataset(
        get_local_hf_dataset_path(experiment_args.hf_dataset),
        name='relations',
        cache_dir=experiment_args.hf_dataset_cache_dir,
        split=get_test_split(experiment_args.aspect, experiment_args.cv_fold))
    docs_ds = load_dataset(get_local_hf_dataset_path(
        experiment_args.hf_dataset),
                           name='docs',
                           cache_dir=experiment_args.hf_dataset_cache_dir,
                           split=datasets.Split('docs'))

    # Forced limit
    if experiment_args.dataset_limit > 0:
        logger.info(
            f'Train and test datasets limited to {experiment_args.dataset_limit} samples'
        )

        train_ds = Dataset(train_ds.data[:experiment_args.dataset_limit])
        test_ds = Dataset(test_ds.data[:experiment_args.dataset_limit])

    # Build ID => Doc mapping
    doc_id2doc = {doc[experiment_args.doc_id_col]: doc for doc in docs_ds}

    if model_args.model_name_or_path.startswith('baseline-rnn'):
        # Load Spacy as tokenizer
        spacy_nlp = spacy.load(experiment_args.spacy_model,
                               disable=["tagger", "ner", "textcat"])

        if experiment_args.multi_label:
            # Baseline models
            model = RNNForMultiLabelSequenceClassification(
                word_vectors=get_vectors_from_spacy_model(spacy_nlp),
                hidden_size=experiment_args.rnn_hidden_size,
                rnn=experiment_args.rnn_type,
                num_labels=num_labels,
                num_layers=experiment_args.rnn_num_layers,
                dropout=experiment_args.rnn_dropout,
            )
        else:
            raise NotImplementedError(
                'RNN baseline is only available for multi label classification'
            )

        tokenizer = None

    else:
        # Load pretrained Transformers models and tokenizers
        model_config = AutoConfig.from_pretrained(
            model_args.model_name_or_path,
            num_labels=num_labels,
            cache_dir=model_args.cache_dir)

        # No need for spacy
        spacy_nlp = None

        if 'longformer' in model_args.model_name_or_path:
            # TVM: a custom CUDA kernel implementation of our sliding window attention (works only on GPU)
            model_config.attention_mode = 'tvm'

            # override tokenizer name if not set
            if model_args.tokenizer_name is None:
                roberta_path = os.path.join(env['bert_dir'], 'roberta-base')
                model_args.tokenizer_name = roberta_path if os.path.exists(
                    roberta_path) else 'roberta-base'

                logger.info(
                    f'Overriding tokenizer: {model_args.tokenizer_name}')

            # override max length
            experiment_args.max_length = 4096

        if experiment_args.multi_label:
            model_cls = AutoModelForMultiLabelSequenceClassification
        else:
            model_cls = AutoModelForSequenceClassification

        model = model_cls.from_pretrained(model_args.model_name_or_path,
                                          config=model_config,
                                          cache_dir=model_args.cache_dir)
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        # Set token limit if defined by model (for Longformer)
        if model.config.max_position_embeddings > 0:
            tokenizer.model_max_length = model.config.max_position_embeddings

    # Init helper
    dpt = DocRelTrainerHelper(
        id2doc=doc_id2doc,
        transformers_tokenizer=tokenizer,
        spacy_nlp=spacy_nlp,
        label_classes=label_classes,
        binary_classification=experiment_args.binary_classification,
        doc_a_col=experiment_args.doc_a_col,
        doc_b_col=experiment_args.doc_b_col,
        label_col=experiment_args.label_col,
        text_from_doc_func=get_non_empty_text_from_doc,
        classification_threshold=experiment_args.classification_threshold,
        max_length=experiment_args.max_length,
        multi_label=experiment_args.multi_label,
    )

    logger.info('Converting to features (doc mapping, tokenize, ...)')

    # Build hash from settings for caching
    data_settings_hash = hashlib.md5(
        dataclasses.asdict(experiment_args).__str__().encode("utf-8") +
        dataclasses.asdict(model_args).__str__().encode("utf-8")).hexdigest()

    train_tensor_ds = train_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-train-" + data_settings_hash + ".arrow"))
    train_tensor_ds.set_format(type='torch', columns=columns)

    test_tensor_ds = test_ds.map(
        dpt.convert_to_features,
        batched=True,
        load_from_cache_file=True,
        num_proc=int(env['workers']),
        cache_file_name=os.path.join(
            experiment_args.hf_dataset_cache_dir,
            "cache-test-" + data_settings_hash + ".arrow"))
    test_tensor_ds.set_format(type='torch', columns=columns)

    logger.info(f'Dataset columns: {columns}')
    logger.info(f'Train sample: {train_ds[0]}')
    logger.debug(f'- as tensor: {train_tensor_ds[0]}')

    logger.info(f'Test sample: {test_ds[0]}')
    logger.debug(f'- as tensor: {test_tensor_ds[0]}')

    # Load models weights (when no training but predictions)
    model_weights_path = os.path.join(training_args.output_dir,
                                      'pytorch_model.bin')

    if not training_args.do_train and experiment_args.save_predictions:
        logger.info(
            f'Loading existing model weights from disk: {model_weights_path}')
        if os.path.exists(model_weights_path):
            model.load_state_dict(torch.load(model_weights_path))
        else:
            logger.error('Weights file does not exist!')

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tensor_ds,
        eval_dataset=test_tensor_ds,
        data_collator=DocRelDataCollator(),
        #prediction_loss_only=False,
        compute_metrics=dpt.compute_metrics,
    )

    # Log additional config (to Weights & Biases)
    if is_wandb_available():
        extra_config = {}
        extra_config.update(dataclasses.asdict(experiment_args))
        extra_config.update(dataclasses.asdict(model_args))

        wandb.config.update(extra_config, allow_val_change=True)

    if training_args.do_train:
        logger.info('Training started...')
        trainer.train()

        if isinstance(model, PreTrainedModel):
            trainer.save_model()
            tokenizer.save_pretrained(training_args.output_dir)

        elif isinstance(model, nn.Module):  # RNN model
            torch.save(model.state_dict(), model_weights_path)

    if experiment_args.save_predictions:
        logger.info('Predicting...')

        predictions = trainer.predict(test_tensor_ds)

        df = dpt.get_df_from_predictions(test_ds,
                                         docs_ds,
                                         predictions,
                                         exclude_columns=['abstract'])

        # Save results to disk
        df.to_csv(os.path.join(training_args.output_dir, 'results.csv'),
                  index=False)
        json.dump(
            predictions.metrics,
            open(os.path.join(training_args.output_dir, 'metrics.json'), 'w'))

    logger.info('Done')
Example No. 24
def get_test_split(aspect, k):
    return datasets.Split(f'fold_{aspect}_{k}_test')
Example No. 25
def get_train_split(aspect, k):
    return datasets.Split(f'fold_{aspect}_{k}_train')
Example No. 26
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
     qanta_path = dl_manager.download_and_extract(_QANTA_URL)
     trick_path = dl_manager.download_and_extract(_TRICK_URL)
     return [
         datasets.SplitGenerator(
             name=datasets.Split("guesstrain"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "guesstrain",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("buzztrain"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "buzztrain",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("guessdev"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "guessdev",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("buzzdev"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "buzzdev",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("guesstest"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "guesstest",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("buzztest"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "buzztest",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
         datasets.SplitGenerator(
             name=datasets.Split("adversarial"),
             gen_kwargs={
                 "qanta_filepath": qanta_path,
                 "trick_filepath": trick_path,
                 "fold": "adversarial",
                 "mode": self.config.mode,
                 "char_skip": self.config.char_skip,
             },
         ),
     ]
Example No. 27
    def _split_generators(self, dl_manager):
        if '_cross_validation_' in self.config.name:
            return self._split_generators_for_cross_validation(dl_manager)

        dl_dir = dl_manager.download_and_extract(self.config.data_url) or ""
        task_name = _get_task_name_from_data_url(self.config.data_url)
        dl_dir = os.path.join(dl_dir, task_name)
        if self.config.name in ["axb", "axg"]:
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TEST,
                    gen_kwargs={
                        "data_file":
                        os.path.join(dl_dir, "{}.jsonl".format(task_name)),
                        "split":
                        datasets.Split.TEST,
                    },
                ),
            ]

        if self.config.train_path is not None:
            train_path = dl_manager.download_and_extract(
                self.config.train_path)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={
                        "data_file": train_path,
                        "split": datasets.Split.TRAIN,
                    },
                )
            ]
        elif self.config.is_few_shot:
            train_path = dl_manager.download_and_extract(
                os.path.join(self.config.few_shot_url, "train.jsonl"))
        else:
            train_path = os.path.join(dl_dir, "train.jsonl")

        splits = [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_file": train_path,
                    "split": datasets.Split.TRAIN,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                gen_kwargs={
                    "data_file": os.path.join(dl_dir, "val.jsonl"),
                    "split": datasets.Split.VALIDATION,
                },
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={
                    "data_file": os.path.join(dl_dir, "test.jsonl"),
                    "split": datasets.Split.TEST,
                },
            ),
        ]

        if self.config.is_few_shot:
            splits.append(
                datasets.SplitGenerator(
                    name=datasets.Split("unlabeled"),
                    gen_kwargs={
                        "data_file": dl_manager.download_and_extract(
                            os.path.join(self.config.few_shot_url,
                                         "unlabeled.jsonl")),
                        "split": datasets.Split("unlabeled"),
                    },
                ))

        return splits
Example No. 28
parser.add_argument("--RATE", type=float, default=0.5, help="learning rate")
parser.add_argument("--BACKEND", default="cpu", help="backend mode")
parser.add_argument("--DATASET", default="simple", help="dataset")
parser.add_argument("--PLOT", default=False, help="dataset")

args = parser.parse_args()


PTS = args.PTS

if args.DATASET == "xor":
    DATASET = datasets.Xor(PTS, vis=True)
elif args.DATASET == "simple":
    DATASET = datasets.Simple(PTS, vis=True)
elif args.DATASET == "split":
    DATASET = datasets.Split(PTS, vis=True)

HIDDEN = int(args.HIDDEN)
RATE = args.RATE


# Change which backend to use

if args.BACKEND == "cpu":
    BACKEND = minitorch.make_tensor_backend(minitorch.FastOps)
elif args.BACKEND == "old":
    # Module-2 backend
    # You can use this to debug, but you will need to add a
    # Matrix multiplication @ operator
    BACKEND = minitorch.TensorFunctions
elif args.BACKEND == "gpu":