def _split_generators(self, dl_manager):
    """Download the three per-split CSV files and describe the dataset splits.

    The file URLs are derived from ``_BASE_URL`` and ``self.config.name``
    following the pattern ``<base>/stsb-<config name>-<part>.csv`` for the
    parts ``train``, ``dev`` and ``test``.

    Args:
        dl_manager: Object whose ``download`` method accepts a dict of URLs
            and returns a dict with the same keys.

    Returns:
        A list of ``datasets.SplitGenerator`` in the order train, test, dev.
        Each one carries a single ``"filepath"`` entry in ``gen_kwargs``
        (these kwargs will be passed to ``_generate_examples``).
    """
    config_name = self.config.name
    # Insertion order (train, dev, test) is kept identical to the URL table
    # handed to the download manager.
    urls_by_part = {
        part: f"{_BASE_URL}/stsb-{config_name}-{part}.csv"
        for part in ("train", "dev", "test")
    }
    local_files = dl_manager.download(urls_by_part)

    # The "dev" file is exposed under a split literally named "dev".
    split_plan = (
        (datasets.Split.TRAIN, "train"),
        (datasets.Split.TEST, "test"),
        (datasets.NamedSplit("dev"), "dev"),
    )
    return [
        datasets.SplitGenerator(
            name=split_name,
            gen_kwargs={"filepath": local_files[part]},
        )
        for split_name, part in split_plan
    ]
Example #2
0
    def _split_generators(self, dl_manager):
        """Return the split generators for the task named by ``self.config.name``.

        How the data is obtained depends on the config name:

        * ``"asr"``: three archives (``data_url`` + file name) are downloaded
          and extracted, one per split.
        * ``"ks"``: two archives are downloaded and extracted; the first one
          backs both the train and validation splits, the second the test
          split.
        * ``"ic"``: a single archive at ``data_url`` backs all three splits.
        * ``"si"``: nothing is downloaded; the absolute, user-expanded
          ``dl_manager.manual_dir`` is used for all three splits, which are
          told apart by the integers 1, 2 and 3.
        * ``"sd"``: for each of ``train``/``dev``/``test`` the files
          ``reco2dur``, ``segments``, ``utt2spk`` and ``wav.zip`` are
          downloaded and extracted; splits are named after those strings.
        * ``"er"``: nothing is downloaded; ``dl_manager.manual_dir`` backs
          five splits named ``session1`` .. ``session5``.

        Args:
            dl_manager: Object providing ``download_and_extract`` and
                ``manual_dir``.

        Returns:
            A list of ``datasets.SplitGenerator``. Every ``gen_kwargs`` holds
            an ``"archive_path"`` entry and, for every config except
            ``"asr"``, a ``"split"`` entry.

        Raises:
            ValueError: If ``self.config.name`` is not one of the config
                names listed above.
        """
        task = self.config.name
        if task == "asr":
            dl_urls = {
                "dev": self.config.data_url + "dev-clean.tar.gz",
                "test": self.config.data_url + "test-clean.tar.gz",
                "train": self.config.data_url + "train-clean-100.tar.gz",
            }
            archive_path = dl_manager.download_and_extract(dl_urls)

            return [
                datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"archive_path": archive_path["train"]}),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION, gen_kwargs={"archive_path": archive_path["dev"]}
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"]}),
            ]
        elif task == "ks":
            dl_urls = {
                "train_val_test": self.config.data_url.format(filename="speech_commands_v0.01.tar.gz"),
                "test": self.config.data_url.format(filename="speech_commands_test_set_v0.01.tar.gz"),
            }
            archive_path = dl_manager.download_and_extract(dl_urls)
            return [
                # Train and validation are both read from the same archive.
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path["train_val_test"], "split": "val"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path["test"], "split": "test"}
                ),
            ]
        elif task == "ic":
            archive_path = dl_manager.download_and_extract(self.config.data_url)
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": archive_path, "split": "train"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": archive_path, "split": "valid"},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.TEST, gen_kwargs={"archive_path": archive_path, "split": "test"}
                ),
            ]
        elif task == "si":
            # No download here: the data location comes from dl_manager.manual_dir.
            manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
            return [
                datasets.SplitGenerator(
                    name=datasets.Split.TRAIN,
                    gen_kwargs={"archive_path": manual_dir, "split": 1},
                ),
                datasets.SplitGenerator(
                    name=datasets.Split.VALIDATION,
                    gen_kwargs={"archive_path": manual_dir, "split": 2},
                ),
                datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"archive_path": manual_dir, "split": 3}),
            ]
        elif task == "sd":
            splits = ["train", "dev", "test"]
            # Nested mapping: split name -> file name -> URL.
            dl_urls = {
                split: {
                    filename: self.config.data_url.format(split=split, filename=filename)
                    for filename in ["reco2dur", "segments", "utt2spk", "wav.zip"]
                }
                for split in splits
            }
            archive_path = dl_manager.download_and_extract(dl_urls)
            return [
                datasets.SplitGenerator(
                    name=datasets.NamedSplit(split), gen_kwargs={"archive_path": archive_path[split], "split": split}
                )
                for split in splits
            ]
        elif task == "er":
            # No download here: the data location comes from dl_manager.manual_dir.
            manual_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir))
            return [
                datasets.SplitGenerator(
                    name=f"session{i}",
                    gen_kwargs={"archive_path": manual_dir, "split": i},
                )
                for i in range(1, 6)
            ]
        else:
            # Fail loudly instead of implicitly returning None, which would
            # only surface later as an unrelated-looking error in the caller.
            raise ValueError(
                f"Unsupported config name {task!r}; expected one of: asr, ks, ic, si, sd, er"
            )